Commit 14c9f3f

Merge remote-tracking branch 'upstream/master' into GH28501
2 parents: c91a875 + 492c5e0


45 files changed: +1216, -1178 lines

asv_bench/benchmarks/algorithms.py (+12)

@@ -5,6 +5,7 @@
 from pandas._libs import lib

 import pandas as pd
+from pandas.core.algorithms import make_duplicates_of_left_unique_in_right

 from .pandas_vb_common import tm

@@ -174,4 +175,15 @@ def time_argsort(self, N):
         self.array.argsort()


+class RemoveDuplicates:
+    def setup(self):
+        N = 10 ** 5
+        na = np.arange(int(N / 2))
+        self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
+        self.right = np.concatenate([na, na])
+
+    def time_make_duplicates_of_left_unique_in_right(self):
+        make_duplicates_of_left_unique_in_right(self.left, self.right)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
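
Note: ASV runs setup() before each timing, so only the call inside time_make_duplicates_of_left_unique_in_right is measured. A quick sanity check of the benchmark body outside ASV (a sketch assuming a dev install of this branch, where the new helper exists; the assert is illustrative):

import numpy as np
from pandas.core.algorithms import make_duplicates_of_left_unique_in_right

N = 10 ** 5
na = np.arange(int(N / 2))
left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])  # each value appears twice
right = np.concatenate([na, na])                             # each value appears twice

result = make_duplicates_of_left_unique_in_right(left, right)
# each left-duplicated value keeps exactly one occurrence in right
assert len(result) == len(right) - len(np.unique(left))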

doc/source/user_guide/merging.rst (+1, -8)

@@ -194,7 +194,7 @@ behavior:
        },
        index=[2, 3, 6, 7],
    )
-   result = pd.concat([df1, df4], axis=1, sort=False)
+   result = pd.concat([df1, df4], axis=1)


 .. ipython:: python

@@ -204,13 +204,6 @@ behavior:
    p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False);
    plt.close("all");

-.. warning::
-
-   The default behavior with ``join='outer'`` is to sort the other axis
-   (columns in this case). In a future version of pandas, the default will
-   be to not sort. We specified ``sort=False`` to opt in to the new
-   behavior now.
-
 Here is the same thing with ``join='inner'``:

 .. ipython:: python
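
Note: the removed warning described transitional behavior; the non-sorting default has since taken effect (with pandas 1.0, as far as I can tell), which is why ``sort=False`` above became redundant. A minimal sketch of the difference, with indices deliberately out of order so the effect is visible:

import pandas as pd

df1 = pd.DataFrame({"A": [1, 2]}, index=[0, 1])
df4 = pd.DataFrame({"B": [3, 4]}, index=[3, 2])

# default (sort=False): the union index keeps order of appearance
pd.concat([df1, df4], axis=1).index             # Int64Index([0, 1, 3, 2], ...)
# opting back in to the old sorting behavior
pd.concat([df1, df4], axis=1, sort=True).index  # Int64Index([0, 1, 2, 3], ...)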

doc/source/whatsnew/v1.2.0.rst (+5, -1)

@@ -238,6 +238,7 @@ Other enhancements
 -
 - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
+- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
 - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
 - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
 - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`)

@@ -474,7 +475,8 @@ Deprecations
 - :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. Use the named set methods instead (:issue:`36758`)
 - :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`)
 - :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`)
-- Partial slicing on unordered :class:`DatetimeIndexes` with keys, which are not in Index is deprecated and will be removed in a future version (:issue:`18531`)
+- Partial slicing on unordered :class:`DatetimeIndex` with keys, which are not in Index is deprecated and will be removed in a future version (:issue:`18531`)
+- Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`DatetimeIndex`, :class:`TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`)
 - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`)

 .. ---------------------------------------------------------------------------

@@ -612,6 +614,7 @@ Indexing
 - Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`)
 - Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty :class:`DataFrame` with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`)
 - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`)
+- Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`)

 Missing
 ^^^^^^^

@@ -708,6 +711,7 @@ Reshaping
 - Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
 - Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`)
 - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
+- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`)

 Sparse
 ^^^^^^
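
Note: a hedged illustration of the to_parquet enhancement noted above (:issue:`34777`); it assumes pandas >= 1.2 with pyarrow installed, and the file name is arbitrary:

import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]])
df.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])

# with this enhancement, MultiIndex columns should round-trip through parquet
df.to_parquet("mi_cols.parquet", engine="pyarrow")
pd.read_parquet("mi_cols.parquet").columns  # expected: the same MultiIndex back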

pandas/core/algorithms.py (+21)

@@ -2150,3 +2150,24 @@ def _sort_tuples(values: np.ndarray[tuple]):
     arrays, _ = to_arrays(values, None)
     indexer = lexsort_indexer(arrays, orders=True)
     return values[indexer]
+
+
+def make_duplicates_of_left_unique_in_right(
+    left: np.ndarray, right: np.ndarray
+) -> np.ndarray:
+    """
+    If left has duplicates that are also duplicated in right, these duplicated
+    values are dropped from right, so that every duplicate value from left
+    exists only once in right.
+
+    Parameters
+    ----------
+    left : ndarray
+    right : ndarray
+
+    Returns
+    -------
+    ndarray where duplicates of left are unique in right
+    """
+    left_duplicates = unique(left[duplicated(left)])
+    return right[~(duplicated(right) & isin(right, left_duplicates))]
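
Note: to make the intent concrete, here is the same logic spelled out with public APIs; pd.unique, Series.duplicated, and np.isin stand in for the internal unique, duplicated, and isin used above (a sketch, not the patch itself):

import numpy as np
import pandas as pd

left = np.array([1, 1, 2, 3])
right = np.array([1, 1, 1, 2, 2, 3])

# values occurring more than once in left
left_duplicates = pd.unique(left[pd.Series(left).duplicated().to_numpy()])

# drop repeat occurrences in right of those values only; 2 keeps both
# occurrences because it is not duplicated in left
mask = pd.Series(right).duplicated().to_numpy() & np.isin(right, left_duplicates)
right[~mask]  # array([1, 2, 2, 3])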

pandas/core/frame.py (+16, -25)

@@ -118,7 +118,7 @@
 )
 from pandas.core.dtypes.missing import isna, notna

-from pandas.core import algorithms, common as com, nanops, ops
+from pandas.core import algorithms, common as com, generic, nanops, ops
 from pandas.core.accessor import CachedAccessor
 from pandas.core.aggregation import (
     aggregate,

@@ -2066,6 +2066,7 @@ def _from_arrays(
         )
         return cls(mgr)

+    @doc(storage_options=generic._shared_docs["storage_options"])
     @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
     def to_stata(
         self,

@@ -2118,7 +2119,7 @@ def to_stata(
         variable_labels : dict
             Dictionary containing columns as keys and variable labels as
             values. Each label must be 80 characters or smaller.
-        version : {114, 117, 118, 119, None}, default 114
+        version : {{114, 117, 118, 119, None}}, default 114
             Version to use in the output dta file. Set to None to let pandas
             decide between 118 or 119 formats depending on the number of
             columns in the frame. Version 114 can be read by Stata 10 and

@@ -2147,23 +2148,17 @@ def to_stata(
         compression : str or dict, default 'infer'
             For on-the-fly compression of the output dta. If string, specifies
             compression mode. If dict, value at key 'method' specifies
-            compression mode. Compression mode must be one of {'infer', 'gzip',
-            'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and
+            compression mode. Compression mode must be one of {{'infer', 'gzip',
+            'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and
             `fname` is path-like, then detect compression from the following
             extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
-            compression). If dict and compression mode is one of {'zip',
-            'gzip', 'bz2'}, or inferred as one of the above, other entries
+            compression). If dict and compression mode is one of {{'zip',
+            'gzip', 'bz2'}}, or inferred as one of the above, other entries
             passed as additional compression options.

             .. versionadded:: 1.1.0

-        storage_options : dict, optional
-            Extra options that make sense for a particular storage connection, e.g.
-            host, port, username, password, etc., if using a URL that will
-            be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
-            will be raised if providing this argument with a local path or
-            a file-like buffer. See the fsspec and backend storage implementation
-            docs for the set of allowed keys and values.
+        {storage_options}

             .. versionadded:: 1.2.0

@@ -2186,9 +2181,9 @@ def to_stata(

         Examples
         --------
-        >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
+        >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
         ...                               'parrot'],
-        ...                    'speed': [350, 18, 361, 15]})
+        ...                    'speed': [350, 18, 361, 15]}})
         >>> df.to_stata('animals.dta')  # doctest: +SKIP
         """
         if version not in (114, 117, 118, 119, None):

@@ -2255,6 +2250,7 @@ def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None:
     @doc(
         Series.to_markdown,
         klass=_shared_doc_kwargs["klass"],
+        storage_options=_shared_docs["storage_options"],
         examples="""Examples
         --------
         >>> df = pd.DataFrame(

@@ -2307,6 +2303,7 @@ def to_markdown(
         handles.handle.writelines(result)
         return None

+    @doc(storage_options=generic._shared_docs["storage_options"])
     @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
     def to_parquet(
         self,

@@ -2340,12 +2337,12 @@ def to_parquet(
             Previously this was "fname"

-        engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+        engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
             Parquet library to use. If 'auto', then the option
             ``io.parquet.engine`` is used. The default ``io.parquet.engine``
             behavior is to try 'pyarrow', falling back to 'fastparquet' if
             'pyarrow' is unavailable.
-        compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
+        compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
             Name of the compression to use. Use ``None`` for no compression.
         index : bool, default None
             If ``True``, include the dataframe's index(es) in the file output.

@@ -2365,13 +2362,7 @@ def to_parquet(

             .. versionadded:: 0.24.0

-        storage_options : dict, optional
-            Extra options that make sense for a particular storage connection, e.g.
-            host, port, username, password, etc., if using a URL that will
-            be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
-            will be raised if providing this argument with a local path or
-            a file-like buffer. See the fsspec and backend storage implementation
-            docs for the set of allowed keys and values.
+        {storage_options}

             .. versionadded:: 1.2.0

@@ -2398,7 +2389,7 @@ def to_parquet(

         Examples
         --------
-        >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
+        >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
         >>> df.to_parquet('df.parquet.gzip',
         ...               compression='gzip')  # doctest: +SKIP
         >>> pd.read_parquet('df.parquet.gzip')  # doctest: +SKIP
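
Note: the doubled braces throughout this file are needed because @doc treats the docstring as a str.format template when it substitutes {storage_options}, so every literal {...} set must be escaped as {{...}}. A standalone sketch of the mechanism, with plain str.format standing in for pandas.util._decorators.doc and an abbreviated shared fragment:

_shared_docs = {
    "storage_options": (
        "storage_options : dict, optional\n"
        "    Extra options for a particular storage connection ..."
    )
}

template = """\
compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
    Name of the compression to use.
{storage_options}
"""

# {{...}} survives as a literal {...}; {storage_options} is expanded
print(template.format(**_shared_docs))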
