API: honor copy=True when passing dict to DataFrame #38939
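For context, a minimal sketch of the constructor behavior this PR targets. The exact default for dict input has shifted across pandas versions, so treat the copy semantics shown as the post-PR contract rather than a universal guarantee:

```python
import numpy as np
import pandas as pd

arr = np.array([1.0, 2.0, 3.0])

# With copy=True honored, the frame gets its own buffer;
# copy=False instead aims to share the caller's arrays (and
# leaves the frame unconsolidated, one block per column).
df = pd.DataFrame({"a": arr}, copy=True)
df.iloc[0, 0] = 99.0
print(arr[0])  # 1.0 -- the caller's array is untouched
```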
Changes from 12 commits
@@ -650,7 +650,7 @@ def getPeriodData(nper=None):
 # make frame
 def makeTimeDataFrame(nper=None, freq="B"):
     data = getTimeSeriesData(nper, freq)
-    return DataFrame(data)
+    return DataFrame(data)._consolidate()
 
 
 def makeDataFrame():
Review comment: This change should in theory no longer be needed? (Assuming this was done to ensure a consolidated DataFrame when the default was changed to not copy, and thus resulted in a non-consolidated DataFrame; the same applies to the two cases just below.)
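Consolidation is what the `_consolidate()` call above guarantees: same-dtype columns get packed into a single 2-D block. A small sketch of how to observe this, using the internal `_mgr.nblocks` attribute (an implementation detail that may change between pandas versions):

```python
import pandas._testing as tm

df = tm.makeTimeDataFrame()  # all-float frame

# nblocks == 1 means the float columns share one consolidated 2-D block;
# one block per column would mean the frame is unconsolidated.
print(df._mgr.nblocks)
print(df._consolidate()._mgr.nblocks)  # always 1 after consolidation
```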
@@ -16,7 +16,7 @@
 
 import numpy as np
 
-from pandas._libs import internals as libinternals, lib
+from pandas._libs import NaT, internals as libinternals, lib
 from pandas._typing import ArrayLike, DtypeObj, Label, Shape
 from pandas.errors import PerformanceWarning
 from pandas.util._validators import validate_bool_kwarg
@@ -956,7 +956,15 @@ def fast_xs(self, loc: int) -> ArrayLike:
             # Such assignment may incorrectly coerce NaT to None
             # result[blk.mgr_locs] = blk._slice((slice(None), loc))
             for i, rl in enumerate(blk.mgr_locs):
-                result[rl] = blk.iget((i, loc))
+                out = blk.iget((i, loc))
+                if is_dtype_equal(blk.dtype, dtype) and dtype == "m8[ns]":
+                    # FIXME: kludge for NaT -> tdnat
+                    # TODO: need a test like test_sum_nanops_timedelta
+                    #  where initial DataFrame is not consolidated
+                    if out is NaT:
+                        result[rl] = np.timedelta64("NaT", "ns")
+                        continue
+                result[rl] = out
 
         if isinstance(dtype, ExtensionDtype):
             result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)

Review thread:
- Reviewer: I know you mention it, but this is very kludgy.
- Author: Yeah, I'm still looking for an alternative here.
- Author: De-kludged. I need to double-check, but I think this may solve a separate silent bug; if so, I'll make a separate PR for that.
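The kludge exists because `pd.NaT` is a single sentinel shared by datetime and timedelta results, and its class is datetime-like, so it cannot be dropped as-is into a `timedelta64[ns]` result. An illustration of the underlying behavior (plain NumPy and the public NaT singleton, not the pandas internals themselves):

```python
from datetime import datetime

import numpy as np
import pandas as pd

# pd.NaT subclasses datetime, so naive coercion treats it as datetime-like
print(isinstance(pd.NaT, datetime))  # True

# For a timedelta64 result, an explicitly typed NaT is unambiguous
result = np.empty(1, dtype="m8[ns]")
result[0] = np.timedelta64("NaT", "ns")
print(np.isnat(result[0]))  # True
```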
@@ -1662,7 +1670,9 @@ def fast_xs(self, loc):
 # Constructor Helpers
 
 
-def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
+def create_block_manager_from_blocks(
+    blocks, axes: List[Index], consolidate: bool = True
+) -> BlockManager:
     try:
         if len(blocks) == 1 and not isinstance(blocks[0], Block):
             # if blocks[0] is of length 0, return empty blocks
@@ -1679,7 +1689,8 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
         ]
 
         mgr = BlockManager(blocks, axes)
-        mgr._consolidate_inplace()
+        if consolidate:
+            mgr._consolidate_inplace()
         return mgr
 
     except ValueError as e:
@@ -1689,7 +1700,10 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
 
 
 def create_block_manager_from_arrays(
-    arrays, names: Index, axes: List[Index]
+    arrays,
+    names: Index,
+    axes: List[Index],
+    consolidate: bool = True,
 ) -> BlockManager:
     assert isinstance(names, Index)
     assert isinstance(axes, list)
@@ -1699,12 +1713,13 @@ def create_block_manager_from_arrays(
     # Note: just calling extract_array breaks tests that patch PandasArray._typ.
     arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays]
     try:
-        blocks = _form_blocks(arrays, names, axes)
+        blocks = _form_blocks(arrays, names, axes, consolidate)
         mgr = BlockManager(blocks, axes)
-        mgr._consolidate_inplace()
-        return mgr
     except ValueError as e:
         raise construction_error(len(arrays), arrays[0].shape, axes, e)
+    if consolidate:
+        mgr._consolidate_inplace()
+    return mgr
 
 
 def construction_error(tot_items, block_shape, axes, e=None):
@@ -1731,7 +1746,7 @@ def construction_error(tot_items, block_shape, axes, e=None):
 # -----------------------------------------------------------------------
 
 
-def _form_blocks(arrays, names: Index, axes) -> List[Block]:
+def _form_blocks(arrays, names: Index, axes, consolidate: bool) -> List[Block]:
     # put "leftover" items in float bucket, where else?
     # generalize?
     items_dict: DefaultDict[str, List] = defaultdict(list)
@@ -1757,23 +1772,31 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]:
 
     blocks: List[Block] = []
     if len(items_dict["FloatBlock"]):
-        float_blocks = _multi_blockify(items_dict["FloatBlock"])
+        float_blocks = _multi_blockify(
+            items_dict["FloatBlock"], consolidate=consolidate
+        )
         blocks.extend(float_blocks)
 
     if len(items_dict["ComplexBlock"]):
-        complex_blocks = _multi_blockify(items_dict["ComplexBlock"])
+        complex_blocks = _multi_blockify(
+            items_dict["ComplexBlock"], consolidate=consolidate
+        )
         blocks.extend(complex_blocks)
 
     if len(items_dict["TimeDeltaBlock"]):
-        timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"])
+        timedelta_blocks = _multi_blockify(
+            items_dict["TimeDeltaBlock"], consolidate=consolidate
+        )
         blocks.extend(timedelta_blocks)
 
     if len(items_dict["IntBlock"]):
-        int_blocks = _multi_blockify(items_dict["IntBlock"])
+        int_blocks = _multi_blockify(items_dict["IntBlock"], consolidate=consolidate)
         blocks.extend(int_blocks)
 
     if len(items_dict["DatetimeBlock"]):
-        datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE)
+        datetime_blocks = _simple_blockify(
+            items_dict["DatetimeBlock"], DT64NS_DTYPE, consolidate=consolidate
+        )
         blocks.extend(datetime_blocks)
 
     if len(items_dict["DatetimeTZBlock"]):
@@ -1784,11 +1807,15 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]:
         blocks.extend(dttz_blocks)
 
     if len(items_dict["BoolBlock"]):
-        bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_)
+        bool_blocks = _simple_blockify(
+            items_dict["BoolBlock"], np.bool_, consolidate=consolidate
+        )
         blocks.extend(bool_blocks)
 
     if len(items_dict["ObjectBlock"]) > 0:
-        object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
+        object_blocks = _simple_blockify(
+            items_dict["ObjectBlock"], np.object_, consolidate=consolidate
+        )
         blocks.extend(object_blocks)
 
     if len(items_dict["CategoricalBlock"]) > 0:
@@ -1827,11 +1854,14 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]:
     return blocks
 
 
-def _simple_blockify(tuples, dtype) -> List[Block]:
+def _simple_blockify(tuples, dtype, consolidate: bool) -> List[Block]:
     """
     return a single array of a block that has a single dtype; if dtype is
     not None, coerce to this dtype
     """
+    if not consolidate:
+        return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype)
+
     values, placement = _stack_arrays(tuples, dtype)
 
     # TODO: CHECK DTYPE?
@@ -1842,8 +1872,12 @@ def _simple_blockify(tuples, dtype) -> List[Block]:
     return [block]
 
 
-def _multi_blockify(tuples, dtype=None):
+def _multi_blockify(tuples, dtype=None, consolidate: bool = True):
     """ return an array of blocks that potentially have different dtypes """
 
+    if not consolidate:
+        return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype)
+
     # group by dtype
     grouper = itertools.groupby(tuples, lambda x: x[2].dtype)
@@ -1858,6 +1892,18 @@ def _multi_blockify(tuples, dtype=None):
     return new_blocks
 
 
+def _tuples_to_blocks_no_consolidate(tuples, dtype: Optional[DtypeObj]) -> List[Block]:
+    # tuples produced within _form_blocks are of the form (placement, whatever, array)
+    if dtype is not None:
+        return [
+            make_block(
+                np.atleast_2d(x[2].astype(dtype, copy=False)), placement=x[0], ndim=2
+            )
+            for x in tuples
+        ]
+    return [make_block(np.atleast_2d(x[2]), placement=x[0], ndim=2) for x in tuples]
+
+
 def _stack_arrays(tuples, dtype):
 
     # fml
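The no-consolidate path above keeps each input array as its own block instead of stacking same-dtype arrays into one buffer. Two NumPy idioms it leans on are worth noting; a small illustration in plain NumPy, outside the pandas internals:

```python
import numpy as np

arr = np.arange(3, dtype="f8")

# astype(..., copy=False) returns the input itself when no cast is needed,
# so already-correct arrays pass through without being copied
same = arr.astype("f8", copy=False)
print(same is arr)  # True

# atleast_2d wraps a 1-D array as a (1, n) view rather than copying it
two_d = np.atleast_2d(arr)
print(two_d.base is arr, two_d.shape)  # True (1, 3)
```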
@@ -1297,7 +1297,7 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne)
             f(df, 0)
 
     def test_comparison_protected_from_errstate(self):
-        missing_df = tm.makeDataFrame()
+        missing_df = tm.makeDataFrame()._consolidate()
         missing_df.iloc[0]["A"] = np.nan
         with np.errstate(invalid="ignore"):
             expected = missing_df.values < 0

Review thread:
- Reviewer: You are already doing this on creation (I understand, but find this fragile).
- Author: I don't think so.
- Author: I agree. Silver lining: finding the existing fragility.
- Reviewer: Actually, you are doing this on creation; maybe you recently added it. Prefer NOT to do this in the tests proper (in pandas/testing is OK).
- Author: Updated to do the consolidation in tm.makeDataFrame.