Skip to content

Commit f5753fa

Browse files
authored
Merge pull request #252 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents 49ddb56 + e045034 commit f5753fa

30 files changed

+259
-51
lines changed

doc/source/user_guide/cookbook.rst

+3
Original file line numberDiff line numberDiff line change
@@ -1211,6 +1211,9 @@ The :ref:`Excel <io.excel>` docs
12111211
`Modifying formatting in XlsxWriter output
12121212
<https://pbpython.com/improve-pandas-excel-output.html>`__
12131213

1214+
`Loading only visible sheets
1215+
<https://github.com/pandas-dev/pandas/issues/19842#issuecomment-892150745>`__
1216+
12141217
.. _cookbook.html:
12151218

12161219
HTML

doc/source/whatsnew/v1.3.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Fixed regressions
3030

3131
Bug fixes
3232
~~~~~~~~~
33+
- Bug in :meth:`pandas.read_excel` modifying the dtypes dictionary when reading a file with duplicate columns (:issue:`42462`)
3334
- 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`)
3435
- :meth:`.Styler.hide_columns` now hides the index name header row as well as column headers (:issue:`42101`)
3536

doc/source/whatsnew/v1.4.0.rst

+4
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ Deprecations
160160
- Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`)
161161
- Deprecated ignoring missing labels when indexing with a sequence of labels on a level of a MultiIndex (:issue:`42351`)
162162
- Creating an empty Series without a dtype will now raise a more visible ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`30017`)
163+
- Deprecated the 'kind' argument in :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer`, :meth:`Index.slice_locs`; in a future version passing 'kind' will raise (:issue:`42857`)
164+
-
163165

164166
.. ---------------------------------------------------------------------------
165167
@@ -234,6 +236,8 @@ Indexing
234236
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
235237
- Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` failing to raise or incorrectly raising ``ValueError`` when passing an ascending value (:issue:`41634`)
236238
- Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
239+
- Bug in :meth:`DataFrame.query` where the degree sign in a backticked column name, such as \`Temp(°C)\`, was not handled when used in an expression to query a dataframe (:issue:`42826`)
240+
-
237241

238242
Missing
239243
^^^^^^^

pandas/core/computation/parsing.py

+1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def create_valid_python_identifier(name: str) -> str:
4949
"!": "_EXCLAMATIONMARK_",
5050
"$": "_DOLLARSIGN_",
5151
"€": "_EUROSIGN_",
52+
"°": "_DEGREESIGN_",
5253
# Including quotes works, but there are exceptions.
5354
"'": "_SINGLEQUOTE_",
5455
'"': "_DOUBLEQUOTE_",

pandas/core/generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -4840,6 +4840,8 @@ def _reindex_axes(
48404840
copy=copy,
48414841
allow_dups=False,
48424842
)
4843+
# If we've made a copy once, no need to make another one
4844+
copy = False
48434845

48444846
return obj
48454847

pandas/core/groupby/groupby.py

+11-7
Original file line numberDiff line numberDiff line change
@@ -3031,15 +3031,19 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
30313031
if freq is not None or axis != 0:
30323032
return self.apply(lambda x: x.shift(periods, freq, axis, fill_value))
30333033

3034-
return self._get_cythonized_result(
3035-
"group_shift_indexer",
3036-
numeric_only=False,
3037-
cython_dtype=np.dtype(np.int64),
3038-
needs_ngroups=True,
3039-
result_is_index=True,
3040-
periods=periods,
3034+
ids, _, ngroups = self.grouper.group_info
3035+
res_indexer = np.zeros(len(ids), dtype=np.int64)
3036+
3037+
libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)
3038+
3039+
obj = self._obj_with_exclusions
3040+
3041+
res = obj._reindex_with_indexers(
3042+
{self.axis: (obj.axes[self.axis], res_indexer)},
30413043
fill_value=fill_value,
3044+
allow_dups=True,
30423045
)
3046+
return res
30433047

30443048
@final
30453049
@Substitution(name="groupby")

pandas/core/indexes/base.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -5839,7 +5839,7 @@ def slice_indexer(
58395839
start: Hashable | None = None,
58405840
end: Hashable | None = None,
58415841
step: int | None = None,
5842-
kind: str_t | None = None,
5842+
kind=no_default,
58435843
) -> slice:
58445844
"""
58455845
Compute the slice indexer for input labels and step.
@@ -5855,6 +5855,8 @@ def slice_indexer(
58555855
step : int, default None
58565856
kind : str, default None
58575857
5858+
.. deprecated:: 1.4.0
5859+
58585860
Returns
58595861
-------
58605862
indexer : slice
@@ -5880,6 +5882,8 @@ def slice_indexer(
58805882
>>> idx.slice_indexer(start='b', end=('c', 'g'))
58815883
slice(1, 3, None)
58825884
"""
5885+
self._deprecated_arg(kind, "kind", "slice_indexer")
5886+
58835887
start_slice, end_slice = self.slice_locs(start, end, step=step)
58845888

58855889
# return a slice
@@ -5928,6 +5932,8 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind=no_default):
59285932
side : {'left', 'right'}
59295933
kind : {'loc', 'getitem'} or None
59305934
5935+
.. deprecated:: 1.3.0
5936+
59315937
Returns
59325938
-------
59335939
label : object
@@ -5962,7 +5968,7 @@ def _searchsorted_monotonic(self, label, side: str_t = "left"):
59625968

59635969
raise ValueError("index must be monotonic increasing or decreasing")
59645970

5965-
def get_slice_bound(self, label, side: str_t, kind=None) -> int:
5971+
def get_slice_bound(self, label, side: str_t, kind=no_default) -> int:
59665972
"""
59675973
Calculate slice bound that corresponds to given label.
59685974
@@ -5975,12 +5981,15 @@ def get_slice_bound(self, label, side: str_t, kind=None) -> int:
59755981
side : {'left', 'right'}
59765982
kind : {'loc', 'getitem'} or None
59775983
5984+
.. deprecated:: 1.4.0
5985+
59785986
Returns
59795987
-------
59805988
int
59815989
Index of label.
59825990
"""
5983-
assert kind in ["loc", "getitem", None]
5991+
assert kind in ["loc", "getitem", None, no_default]
5992+
self._deprecated_arg(kind, "kind", "get_slice_bound")
59845993

59855994
if side not in ("left", "right"):
59865995
raise ValueError(
@@ -6030,7 +6039,7 @@ def get_slice_bound(self, label, side: str_t, kind=None) -> int:
60306039
else:
60316040
return slc
60326041

6033-
def slice_locs(self, start=None, end=None, step=None, kind=None):
6042+
def slice_locs(self, start=None, end=None, step=None, kind=no_default):
60346043
"""
60356044
Compute slice locations for input labels.
60366045
@@ -6044,6 +6053,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
60446053
If None, defaults to 1.
60456054
kind : {'loc', 'getitem'} or None
60466055
6056+
.. deprecated:: 1.4.0
6057+
60476058
Returns
60486059
-------
60496060
start, end : int
@@ -6062,6 +6073,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
60626073
>>> idx.slice_locs(start='b', end='c')
60636074
(1, 3)
60646075
"""
6076+
self._deprecated_arg(kind, "kind", "slice_locs")
60656077
inc = step is None or step >= 0
60666078

60676079
if not inc:

pandas/core/indexes/datetimes.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -722,14 +722,14 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default):
722722
if self._is_strictly_monotonic_decreasing and len(self) > 1:
723723
return upper if side == "left" else lower
724724
return lower if side == "left" else upper
725-
elif isinstance(label, (self._data._recognized_scalars, date)):
725+
elif isinstance(label, self._data._recognized_scalars):
726726
self._deprecate_mismatched_indexing(label)
727727
else:
728728
raise self._invalid_indexer("slice", label)
729729

730730
return self._maybe_cast_for_get_loc(label)
731731

732-
def slice_indexer(self, start=None, end=None, step=None, kind=None):
732+
def slice_indexer(self, start=None, end=None, step=None, kind=lib.no_default):
733733
"""
734734
Return indexer for specified label slice.
735735
Index.slice_indexer, customized to handle time slicing.
@@ -743,6 +743,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
743743
value-based selection in non-monotonic cases.
744744
745745
"""
746+
self._deprecated_arg(kind, "kind", "slice_indexer")
747+
746748
# For historical reasons DatetimeIndex supports slices between two
747749
# instances of datetime.time as if it were applying a slice mask to
748750
# an array of (self.hour, self.minute, self.seconds, self.microsecond).
@@ -800,6 +802,13 @@ def check_str_or_none(point):
800802
else:
801803
return indexer
802804

805+
@doc(Index.get_slice_bound)
def get_slice_bound(self, label, side: str, kind=lib.no_default) -> int:
    # NOTE: no docstring here on purpose; the ``@doc`` decorator supplies the
    # one from ``Index.get_slice_bound``.
    #
    # ``kind`` defaults to ``lib.no_default`` (not ``None``) to match the base
    # class signature and the other overrides in this file. The value is
    # forwarded unchanged to the base implementation, which runs the
    # deprecation check on it; defaulting to ``None`` would forward an
    # explicit ``kind`` on every call (presumably triggering the deprecation
    # warning even when the caller never passed ``kind`` -- confirm against
    # ``Index._deprecated_arg``).

    # GH#42855 handle date here instead of _maybe_cast_slice_bound
    is_plain_date = isinstance(label, date) and not isinstance(label, datetime)
    if is_plain_date:
        # ``datetime`` is a subclass of ``date``; only bare dates are
        # converted, datetimes pass through untouched.
        label = Timestamp(label).to_pydatetime()
    return super().get_slice_bound(label, side=side, kind=kind)
811+
803812
# --------------------------------------------------------------------
804813

805814
@property

pandas/core/indexes/multi.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -2586,7 +2586,7 @@ def _get_indexer_level_0(self, target) -> np.ndarray:
25862586
return ci.get_indexer_for(target)
25872587

25882588
def get_slice_bound(
2589-
self, label: Hashable | Sequence[Hashable], side: str, kind: str | None = None
2589+
self, label: Hashable | Sequence[Hashable], side: str, kind=lib.no_default
25902590
) -> int:
25912591
"""
25922592
For an ordered MultiIndex, compute slice bound
@@ -2601,6 +2601,8 @@ def get_slice_bound(
26012601
side : {'left', 'right'}
26022602
kind : {'loc', 'getitem', None}
26032603
2604+
.. deprecated:: 1.4.0
2605+
26042606
Returns
26052607
-------
26062608
int
@@ -2632,11 +2634,13 @@ def get_slice_bound(
26322634
MultiIndex.get_locs : Get location for a label/slice/list/mask or a
26332635
sequence of such.
26342636
"""
2637+
self._deprecated_arg(kind, "kind", "get_slice_bound")
2638+
26352639
if not isinstance(label, tuple):
26362640
label = (label,)
26372641
return self._partial_tup_index(label, side=side)
26382642

2639-
def slice_locs(self, start=None, end=None, step=None, kind=None):
2643+
def slice_locs(self, start=None, end=None, step=None, kind=lib.no_default):
26402644
"""
26412645
For an ordered MultiIndex, compute the slice locations for input
26422646
labels.
@@ -2655,6 +2659,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
26552659
Slice step
26562660
kind : string, optional, defaults None
26572661
2662+
.. deprecated:: 1.4.0
2663+
26582664
Returns
26592665
-------
26602666
(start, end) : (int, int)
@@ -2688,6 +2694,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
26882694
MultiIndex.get_locs : Get location for a label/slice/list/mask or a
26892695
sequence of such.
26902696
"""
2697+
self._deprecated_arg(kind, "kind", "slice_locs")
26912698
# This function adds nothing to its parent implementation (the magic
26922699
# happens in get_slice_bound method), but it adds meaningful doc.
26932700
return super().slice_locs(start, end, step)

pandas/core/indexes/numeric.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ def _convert_slice_indexer(self, key: slice, kind: str):
244244

245245
# We always treat __getitem__ slicing as label-based
246246
# translate to locations
247-
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
247+
return self.slice_indexer(key.start, key.stop, key.step)
248248

249249
return super()._convert_slice_indexer(key, kind=kind)
250250

pandas/core/internals/array_manager.py

+2
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,8 @@ def _reindex_indexer(
601601
)
602602
else:
603603
arr = self.arrays[i]
604+
if copy:
605+
arr = arr.copy()
604606
new_arrays.append(arr)
605607

606608
else:

pandas/core/sample.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,11 @@ def preprocess_weights(obj: FrameOrSeries, weights, axis: int) -> np.ndarray:
6363
if (weights < 0).any():
6464
raise ValueError("weight vector many not include negative values")
6565

66-
weights[np.isnan(weights)] = 0
66+
missing = np.isnan(weights)
67+
if missing.any():
68+
# Don't modify weights in place
69+
weights = weights.copy()
70+
weights[missing] = 0
6771
return weights
6872

6973

pandas/io/common.py

+53-3
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66
from collections import abc
77
import dataclasses
88
import gzip
9+
import io
910
from io import (
1011
BufferedIOBase,
1112
BytesIO,
1213
RawIOBase,
1314
StringIO,
15+
TextIOBase,
1416
TextIOWrapper,
1517
)
1618
import mmap
@@ -50,7 +52,6 @@
5052

5153
lzma = import_lzma()
5254

53-
5455
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
5556
_VALID_URLS.discard("")
5657

@@ -102,7 +103,7 @@ def close(self) -> None:
102103
avoid closing the potentially user-created buffer.
103104
"""
104105
if self.is_wrapped:
105-
assert isinstance(self.handle, TextIOWrapper)
106+
assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper))
106107
self.handle.flush()
107108
self.handle.detach()
108109
self.created_handles.remove(self.handle)
@@ -712,7 +713,16 @@ def get_handle(
712713

713714
# Convert BytesIO or file objects passed with an encoding
714715
is_wrapped = False
715-
if is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
716+
if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
717+
handle = BytesIOWrapper(
718+
handle,
719+
encoding=ioargs.encoding,
720+
)
721+
handles.append(handle)
722+
# the (text) handle is always provided by the caller
723+
# since get_handle would have opened it in binary mode
724+
is_wrapped = True
725+
elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
716726
handle = TextIOWrapper(
717727
# error: Argument 1 to "TextIOWrapper" has incompatible type
718728
# "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
@@ -878,6 +888,46 @@ def __next__(self) -> str:
878888
return newline.lstrip("\n")
879889

880890

891+
class BytesIOWrapper(io.BytesIO):
    """
    Wrapper that wraps a text buffer (e.g. ``StringIO``) and reads bytes from it.

    Created for compat with pyarrow read_csv.

    Parameters
    ----------
    buffer : StringIO or TextIOBase
        Text stream to read characters from.
    encoding : str, default "utf-8"
        Encoding used to turn the characters read from ``buffer`` into bytes.

    Notes
    -----
    Attributes that are not found on the wrapper itself are looked up on the
    wrapped ``buffer`` (see ``__getattr__``).
    """

    buffer: StringIO | TextIOBase | None

    def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"):
        self.buffer = buffer
        self.encoding = encoding
        # Because a character can be represented by more than 1 byte,
        # encoding the characters read for a request of n bytes can produce
        # more than n bytes. The surplus is kept here and is served first by
        # the next call to ``read``.
        self.overflow = b""

    def __getattr__(self, attr: str):
        # Only reached when normal attribute lookup fails: fall back to the
        # wrapped text buffer.
        return getattr(self.buffer, attr)

    def read(self, n: int | None = -1) -> bytes:
        """
        Read up to ``n`` bytes; read everything left if ``n`` is None or negative.

        Parameters
        ----------
        n : int or None, default -1
            Maximum number of bytes to return.

        Returns
        -------
        bytes
            At most ``n`` bytes (all remaining bytes for None / negative ``n``).
        """
        assert self.buffer is not None

        if n is None or n < 0:
            # Read entire file/rest of file, after any bytes left over from
            # an earlier call.
            everything = self.overflow + self.buffer.read(n).encode(self.encoding)
            self.overflow = b""
            return everything

        pending = self.overflow
        shortfall = n - len(pending)
        if shortfall > 0:
            # Only pull as many characters as bytes are still missing. The
            # previous implementation always read ``n`` more characters, so
            # with multi-byte text ``overflow`` kept growing on every call.
            pending += self.buffer.read(shortfall).encode(self.encoding)

        # Hand out at most n bytes; keep the surplus for the next call.
        self.overflow = pending[n:]
        return pending[:n]

    def detach(self):
        """
        Separate the wrapped text buffer from this wrapper and return it.

        Raises
        ------
        ValueError
            If the buffer has already been detached.
        """
        # Slightly modified from Python's TextIOWrapper detach method
        if self.buffer is None:
            raise ValueError("buffer is already detached")
        self.flush()
        detached = self.buffer
        self.buffer = None
        return detached
929+
930+
881931
def _maybe_memory_map(
882932
handle: FileOrBuffer,
883933
memory_map: bool,

0 commit comments

Comments
 (0)