Skip to content

Add a default zip file name in DataFrame.to_csv when using the 'infer' + 'zip' method #39647

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
1905fcc
enhancement to dataframe to_csv zip method, add this step to perform l…
CyberQin Feb 7, 2021
91de735
REGR: fix transform of empty DataFrame/Series (#39639)
jorisvandenbossche Feb 7, 2021
a8cf281
REG: read_excel with engine specified raises on non-path/non-buffer (…
rhshadrach Feb 7, 2021
23a9b52
DOC: Clarify behavior for Series with dict-like data and index (#39374)
phofl Feb 7, 2021
627e01f
Fixed comment for pandas.unique (#39557) (#39643)
simonjayhawkins Feb 7, 2021
c075383
CLN refactor maybe-castable (#39257)
MarcoGorelli Feb 7, 2021
4028341
CLN: Use kwargs instead of kwds in apply functions (#39625)
rhshadrach Feb 7, 2021
ba60690
[PERF] taking upper 32bit of PyObject_Hash into account (#39592)
realead Feb 7, 2021
56b1c59
REF: remove Float64Index get_loc, __contains__ (#39620)
jbrockmendel Feb 7, 2021
47e5b1c
BUG: inspect.getmembers(Series) (#38782)
topper-123 Feb 7, 2021
83de190
ASV: add benchmarks for concatenating and appending of CategoricalInd…
avinashpancham Feb 7, 2021
d23a66f
BUG: Series[int].__setitem__(mask, td64_or_dt64) incorrect casting (#…
jbrockmendel Feb 7, 2021
f63e09d
CLN: Styler HTML output adopt structured code standard (#39627)
attack68 Feb 7, 2021
362f039
Fix select_dtypes(include='int') for Windows. (#36808)
Feb 7, 2021
c7acbde
cln: redundant function (#39632)
attack68 Feb 7, 2021
8d8f763
REGR: appending to existing excel file created corrupt files (#39605)
twoertwein Feb 7, 2021
102a34c
DOC: typo in 1.2.2 whatsnew (#39646)
arw2019 Feb 7, 2021
dfb19fe
enhancement to dataframe to_csv zip method,add this step to perform l…
CyberQin Feb 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,29 @@ def setup(self):
self.a = pd.Categorical(list("aabbcd") * N)
self.b = pd.Categorical(list("bbcdjk") * N)

self.idx_a = pd.CategoricalIndex(range(N), range(N))
self.idx_b = pd.CategoricalIndex(range(N + 1), range(N + 1))
self.df_a = pd.DataFrame(range(N), columns=["a"], index=self.idx_a)
self.df_b = pd.DataFrame(range(N + 1), columns=["a"], index=self.idx_b)

def time_concat(self):
    # Timed ASV benchmark body: concatenate ``self.s`` with itself.
    # ``self.s`` is created in ``setup`` (not shown in this hunk) —
    # presumably a categorical-dtype Series; confirm against full file.
    pd.concat([self.s, self.s])

def time_union(self):
    # Timed ASV benchmark body: union of the two pd.Categorical objects
    # built in ``setup`` (``self.a`` and ``self.b`` have overlapping
    # category sets, so the union must merge categories).
    union_categoricals([self.a, self.b])

def time_append_overlapping_index(self):
    # Timed ASV benchmark body: append a CategoricalIndex to itself,
    # i.e. both operands share the exact same categories ("overlapping").
    self.idx_a.append(self.idx_a)

def time_append_non_overlapping_index(self):
    # Timed ASV benchmark body: append a CategoricalIndex whose category
    # set differs (``idx_b`` spans range(N + 1) vs ``idx_a``'s range(N)),
    # exercising the category-reconciliation path.
    self.idx_a.append(self.idx_b)

def time_concat_overlapping_index(self):
    # Timed ASV benchmark body: concat a DataFrame (CategoricalIndex-backed)
    # with itself — identical categories on both sides.
    pd.concat([self.df_a, self.df_a])

def time_concat_non_overlapping_index(self):
    # Timed ASV benchmark body: concat two DataFrames whose
    # CategoricalIndex category sets differ, forcing category merging.
    pd.concat([self.df_a, self.df_b])


class ValueCounts:

Expand Down
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/hash_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,15 @@ def time_isin_outside(self, dtype, exponent):
self.s.isin(self.values_outside)


class UniqueForLargePyObjectInts:
    """ASV benchmark: ``pd.unique`` on Python ints wider than 32 bits.

    Exercises the object hash table with values whose low 32 bits are all
    zero, so only the upper 32 bits of the hash distinguish them (GH 37615).
    """

    def setup(self):
        # 5000 distinct ints of the form k << 32; stored as object dtype so
        # they stay Python ints rather than being cast to a numeric dtype.
        big_ints = [value << 32 for value in range(5000)]
        self.arr = np.array(big_ints, dtype=np.object_)

    def time_unique(self):
        # Timed body: hash-table based uniquing of the object array.
        pd.unique(self.arr)


class IsinWithRandomFloat:
params = [
[np.float64, np.object],
Expand Down
18 changes: 4 additions & 14 deletions doc/source/development/extending.rst
Original file line number Diff line number Diff line change
Expand Up @@ -329,21 +329,11 @@ Each data structure has several *constructor properties* for returning a new
data structure as the result of an operation. By overriding these properties,
you can retain subclasses through ``pandas`` data manipulations.

There are 3 constructor properties to be defined:
There are 3 possible constructor properties to be defined on a subclass:

* ``_constructor``: Used when a manipulation result has the same dimensions as the original.
* ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing.
* ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()``.

Following table shows how ``pandas`` data structures define constructor properties by default.

=========================== ======================= =============
Property Attributes ``Series`` ``DataFrame``
=========================== ======================= =============
``_constructor`` ``Series`` ``DataFrame``
``_constructor_sliced`` ``NotImplementedError`` ``Series``
``_constructor_expanddim`` ``DataFrame`` ``NotImplementedError``
=========================== ======================= =============
* ``DataFrame/Series._constructor``: Used when a manipulation result has the same dimension as the original.
* ``DataFrame._constructor_sliced``: Used when a ``DataFrame`` (sub-)class manipulation result should be a ``Series`` (sub-)class.
* ``Series._constructor_expanddim``: Used when a ``Series`` (sub-)class manipulation result should be a ``DataFrame`` (sub-)class, e.g. ``Series.to_frame()``.

Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties.

Expand Down
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v1.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@ Fixed regressions

- Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`)
- Fixed regression in :class:`DataFrame` constructor reordering element when construction from datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`)
- Fixed regression in :class:`DataFrame.astype` and :class:`Series.astype` not casting to bytes dtype (:issue:`39474`)
- Fixed regression in :meth:`DataFrame.astype` and :meth:`Series.astype` not casting to bytes dtype (:issue:`39474`)
- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`)
- Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`)
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
-

.. ---------------------------------------------------------------------------
Expand Down
9 changes: 6 additions & 3 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
Other API changes
^^^^^^^^^^^^^^^^^
- Partially initialized :class:`CategoricalDtype` objects (i.e. those with ``categories=None``) will no longer compare as equal to fully initialized dtype objects.
-
- Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -253,6 +253,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`)
- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`)
- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`)
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)

.. ---------------------------------------------------------------------------

Expand Down Expand Up @@ -304,6 +305,7 @@ Numeric
- Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
- Bug in :meth:`DataFrame.select_dtypes` resulting in different behavior between Windows and Linux with ``include="int"`` (:issue:`36569`)
-

Conversion
Expand Down Expand Up @@ -338,7 +340,7 @@ Indexing
- Bug in :meth:`Series.__setitem__` raising ``ValueError`` when setting a :class:`Series` with a scalar indexer (:issue:`38303`)
- Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`)
- Bug in :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` always raising ``KeyError`` when slicing with existing strings an :class:`Index` with milliseconds (:issue:`33589`)
- Bug in setting ``timedelta64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`)
- Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, :issue:`39619`)
- Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`)
- Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrectly casting the datetime64 values to integers (:issue:`39266`)
- Bug in :meth:`Index.get_loc` not raising ``KeyError`` when method is specified for ``NaN`` value when ``NaN`` is not in :class:`Index` (:issue:`39382`)
Expand Down Expand Up @@ -445,10 +447,11 @@ Other
- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`)
- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`)
- Bug in :class:`Styler` which caused CSS to duplicate on multiple renders. (:issue:`39395`)
- ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`)
- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`)
- Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`)
- Bug in :class:`Styler` where ``subset`` arg in methods raised an error for some valid multiindex slices (:issue:`33562`)
-
- :class:`Styler` rendered HTML output received minor alterations to conform to the W3C HTML code standard (:issue:`39626`)
-

.. ---------------------------------------------------------------------------
Expand Down
6 changes: 5 additions & 1 deletion pandas/_libs/index_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,14 @@ cdef class {{name}}Engine(IndexEngine):
cdef _make_hash_table(self, Py_ssize_t n):
return _hash.{{name}}HashTable(n)

{{if name not in {'Float64', 'Float32'} }}
cdef _check_type(self, object val):
{{if name not in {'Float64', 'Float32'} }}
if not util.is_integer_object(val):
raise KeyError(val)
{{else}}
if util.is_bool_object(val):
# avoid casting to True -> 1.0
raise KeyError(val)
{{endif}}

cdef void _call_map_locations(self, values):
Expand Down
30 changes: 25 additions & 5 deletions pandas/_libs/src/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,31 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
return result;
}

// For PyObject_Hash holds:
// hash(0.0) == 0 == hash(-0.0)
// hash(X) == 0 if X is a NaN-value
// so it is OK to use it directly
#define kh_python_hash_func(key) (PyObject_Hash(key))

// Hash function for PyObject* keys used by the khash-based hash tables:
// reduces CPython's Py_hash_t (64-bit on most builds) to the 32-bit
// khint32_t width that khash uses.
khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){
    // For PyObject_Hash holds:
    // hash(0.0) == 0 == hash(-0.0)
    // hash(X) == 0 if X is a NaN-value
    // so it is OK to use it directly for doubles
    Py_hash_t hash = PyObject_Hash(key);
    if (hash == -1) {
        // PyObject_Hash reports failure as -1 with a Python exception set;
        // clear the exception and degrade to a constant hash (all such keys
        // collide into one bucket) instead of propagating the error.
        PyErr_Clear();
        return 0;
    }
    #if SIZEOF_PY_HASH_T == 4
    // it is already 32bit value
    return hash;
    #else
    // for 64bit builds,
    // we need information of the upper 32bits as well
    // see GH 37615
    khuint64_t as_uint = (khuint64_t) hash;
    // uints avoid undefined behavior of signed ints
    return (as_uint>>32)^as_uint;
    #endif
}


#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def transform(

# Functions that transform may return empty Series/DataFrame
# when the dtype is not appropriate
if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty:
if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty and not obj.empty:
raise ValueError("Transform function failed")
if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
obj.index
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,8 @@ def unique(values):
Hash table-based unique. Uniques are returned in order
of appearance. This does NOT sort.

Significantly faster than numpy.unique. Includes NA values.
Significantly faster than numpy.unique for long enough sequences.
Includes NA values.

Parameters
----------
Expand Down
40 changes: 20 additions & 20 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def frame_apply(
raw: bool = False,
result_type: Optional[str] = None,
args=None,
kwds=None,
kwargs=None,
) -> FrameApply:
""" construct and return a row or column based frame apply object """
axis = obj._get_axis_number(axis)
Expand All @@ -79,7 +79,7 @@ def frame_apply(
raw=raw,
result_type=result_type,
args=args,
kwds=kwds,
kwargs=kwargs,
)


Expand All @@ -88,14 +88,14 @@ def series_apply(
func: AggFuncType,
convert_dtype: bool = True,
args=None,
kwds=None,
kwargs=None,
) -> SeriesApply:
return SeriesApply(
obj,
func,
convert_dtype,
args,
kwds,
kwargs,
)


Expand All @@ -109,12 +109,12 @@ def __init__(
raw: bool,
result_type: Optional[str],
args,
kwds,
kwargs,
):
self.obj = obj
self.raw = raw
self.args = args or ()
self.kwds = kwds or {}
self.kwargs = kwargs or {}

if result_type not in [None, "reduce", "broadcast", "expand"]:
raise ValueError(
Expand All @@ -126,13 +126,13 @@ def __init__(

# curry if needed
if (
(kwds or args)
(kwargs or args)
and not isinstance(func, (np.ufunc, str))
and not is_list_like(func)
):

def f(x):
return func(x, *args, **kwds)
return func(x, *args, **kwargs)

else:
f = func
Expand Down Expand Up @@ -163,7 +163,7 @@ def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]:
obj = self.obj
arg = self.f
args = self.args
kwargs = self.kwds
kwargs = self.kwargs

_axis = kwargs.pop("_axis", None)
if _axis is None:
Expand Down Expand Up @@ -413,10 +413,10 @@ def maybe_apply_str(self) -> Optional[FrameOrSeriesUnion]:
if callable(func):
sig = inspect.getfullargspec(func)
if "axis" in sig.args:
self.kwds["axis"] = self.axis
self.kwargs["axis"] = self.axis
elif self.axis != 0:
raise ValueError(f"Operation {f} does not support axis=1")
return self.obj._try_aggregate_string_function(f, *self.args, **self.kwds)
return self.obj._try_aggregate_string_function(f, *self.args, **self.kwargs)

def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
"""
Expand All @@ -430,7 +430,7 @@ def maybe_apply_multiple(self) -> Optional[FrameOrSeriesUnion]:
# Note: dict-likes are list-like
if not is_list_like(self.f):
return None
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwds)
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)


class FrameApply(Apply):
Expand Down Expand Up @@ -806,7 +806,7 @@ def __init__(
func: AggFuncType,
convert_dtype: bool,
args,
kwds,
kwargs,
):
self.convert_dtype = convert_dtype

Expand All @@ -816,7 +816,7 @@ def __init__(
raw=False,
result_type=None,
args=args,
kwds=kwds,
kwargs=kwargs,
)

def apply(self) -> FrameOrSeriesUnion:
Expand Down Expand Up @@ -877,17 +877,17 @@ def __init__(
obj: Union[SeriesGroupBy, DataFrameGroupBy],
func: AggFuncType,
args,
kwds,
kwargs,
):
kwds = kwds.copy()
self.axis = obj.obj._get_axis_number(kwds.get("axis", 0))
kwargs = kwargs.copy()
self.axis = obj.obj._get_axis_number(kwargs.get("axis", 0))
super().__init__(
obj,
func,
raw=False,
result_type=None,
args=args,
kwds=kwds,
kwargs=kwargs,
)

def apply(self):
Expand All @@ -903,15 +903,15 @@ def __init__(
obj: Union[Resampler, BaseWindow],
func: AggFuncType,
args,
kwds,
kwargs,
):
super().__init__(
obj,
func,
raw=False,
result_type=None,
args=args,
kwds=kwds,
kwargs=kwargs,
)

def apply(self):
Expand Down
10 changes: 7 additions & 3 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,9 +588,13 @@ def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bo
Otherwise an object array is returned.
"""
# perf shortcut as this is the most common case
if isinstance(arr, np.ndarray):
if maybe_castable(arr) and not copy and dtype is None:
return arr
if (
isinstance(arr, np.ndarray)
and maybe_castable(arr.dtype)
and not copy
and dtype is None
):
return arr

if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)):
# create an extension array from its dtype
Expand Down
Loading