Skip to content

Commit 559040d

Browse files
committed
Merge remote-tracking branch 'upstream/master' into pd_array-in-core
2 parents 0551498 + 9903a54 commit 559040d

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

67 files changed

+396
-240
lines changed

asv_bench/benchmarks/reshape.py

+36
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,42 @@ def time_unstack(self):
5353
self.df.unstack(1)
5454

5555

56+
class ReshapeExtensionDtype:
57+
58+
params = ["datetime64[ns, US/Pacific]", "Period[s]"]
59+
param_names = ["dtype"]
60+
61+
def setup(self, dtype):
62+
lev = pd.Index(list("ABCDEFGHIJ"))
63+
ri = pd.Index(range(1000))
64+
mi = MultiIndex.from_product([lev, ri], names=["foo", "bar"])
65+
66+
index = date_range("2016-01-01", periods=10000, freq="s", tz="US/Pacific")
67+
if dtype == "Period[s]":
68+
index = index.tz_localize(None).to_period("s")
69+
70+
ser = pd.Series(index, index=mi)
71+
df = ser.unstack("bar")
72+
# roundtrips -> df.stack().equals(ser)
73+
74+
self.ser = ser
75+
self.df = df
76+
77+
def time_stack(self, dtype):
78+
self.df.stack()
79+
80+
def time_unstack_fast(self, dtype):
81+
# last level -> doesn't have to make copies
82+
self.ser.unstack("bar")
83+
84+
def time_unstack_slow(self, dtype):
85+
# first level -> must make copies
86+
self.ser.unstack("foo")
87+
88+
def time_transpose(self, dtype):
89+
self.df.T
90+
91+
5692
class Unstack:
5793

5894
params = ["int", "category"]

doc/source/user_guide/window.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,7 @@ The following formula is used to compute exponentially weighted mean with an inp
581581

582582
.. math::
583583
584-
y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{t-i}}{0.5^\frac{t_{t} - t_{i}}{\lambda}},
584+
y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{t-i}}{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda}},
585585
586586
587587
ExponentialMovingWindow also has an ``ignore_na`` argument, which determines how

doc/source/whatsnew/v1.3.0.rst

+5
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,9 @@ Other enhancements
140140
- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
141141
- Add support for parsing ``ISO 8601``-like timestamps with negative signs to :meth:`pandas.Timedelta` (:issue:`37172`)
142142
- Add support for unary operators in :class:`FloatingArray` (:issue:`38749`)
143+
- :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
143144
- :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
145+
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
144146

145147
.. ---------------------------------------------------------------------------
146148
@@ -437,6 +439,7 @@ Timezones
437439
Numeric
438440
^^^^^^^
439441
- Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`)
442+
- Bug in :meth:`DataFrame.sort_values` raising an :class:`IndexError` for empty ``by`` (:issue:`40258`)
440443
- Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`)
441444
- Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
442445
- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
@@ -534,6 +537,7 @@ I/O
534537
- :meth:`read_sql` returned an empty generator if ``chunksize`` was non-zero and the query returned no results. Now returns a generator with a single empty dataframe (:issue:`34411`)
535538
- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
536539
- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
540+
- Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
537541

538542
Period
539543
^^^^^^
@@ -570,6 +574,7 @@ Groupby/resample/rolling
570574
- Bug in :meth:`DataFrameGroupBy.sample` where column selection was not applied to sample result (:issue:`39928`)
571575
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would incorrectly raise a ``ValueError`` when providing ``times`` (:issue:`40164`)
572576
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`)
577+
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
573578

574579
Reshaping
575580
^^^^^^^^^

pandas/_libs/parsers.pyx

+28-25
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,19 @@ from libc.string cimport (
2020
import cython
2121
from cython import Py_ssize_t
2222

23-
from cpython.bytes cimport PyBytes_AsString
23+
from cpython.bytes cimport (
24+
PyBytes_AsString,
25+
PyBytes_FromString,
26+
)
2427
from cpython.exc cimport (
2528
PyErr_Fetch,
2629
PyErr_Occurred,
2730
)
2831
from cpython.object cimport PyObject
29-
from cpython.ref cimport Py_XDECREF
32+
from cpython.ref cimport (
33+
Py_INCREF,
34+
Py_XDECREF,
35+
)
3036
from cpython.unicode cimport (
3137
PyUnicode_AsUTF8String,
3238
PyUnicode_Decode,
@@ -143,7 +149,7 @@ cdef extern from "parser/tokenizer.h":
143149
enum: ERROR_OVERFLOW
144150

145151
ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
146-
int *status)
152+
int *status, const char *encoding_errors)
147153
ctypedef int (*io_cleanup)(void *src)
148154

149155
ctypedef struct parser_t:
@@ -255,8 +261,8 @@ cdef extern from "parser/tokenizer.h":
255261

256262
int parser_trim_buffers(parser_t *self)
257263

258-
int tokenize_all_rows(parser_t *self) nogil
259-
int tokenize_nrows(parser_t *self, size_t nrows) nogil
264+
int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
265+
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
260266

261267
int64_t str_to_int64(char *p_item, int64_t int_min,
262268
int64_t int_max, int *error, char tsep) nogil
@@ -293,7 +299,7 @@ cdef extern from "parser/io.h":
293299
size_t *bytes_read, int *status)
294300

295301
void* buffer_rd_bytes(void *source, size_t nbytes,
296-
size_t *bytes_read, int *status)
302+
size_t *bytes_read, int *status, const char *encoding_errors)
297303

298304

299305
cdef class TextReader:
@@ -316,6 +322,7 @@ cdef class TextReader:
316322
uint64_t parser_start
317323
list clocks
318324
char *c_encoding
325+
const char *encoding_errors
319326
kh_str_starts_t *false_set
320327
kh_str_starts_t *true_set
321328

@@ -370,10 +377,15 @@ cdef class TextReader:
370377
bint verbose=False,
371378
bint mangle_dupe_cols=True,
372379
float_precision=None,
373-
bint skip_blank_lines=True):
380+
bint skip_blank_lines=True,
381+
encoding_errors=b"strict"):
374382

375383
# set encoding for native Python and C library
376384
self.c_encoding = NULL
385+
if isinstance(encoding_errors, str):
386+
encoding_errors = encoding_errors.encode("utf-8")
387+
Py_INCREF(encoding_errors)
388+
self.encoding_errors = PyBytes_AsString(encoding_errors)
377389

378390
self.parser = parser_new()
379391
self.parser.chunksize = tokenize_chunksize
@@ -558,13 +570,7 @@ cdef class TextReader:
558570
pass
559571

560572
def __dealloc__(self):
561-
parser_free(self.parser)
562-
if self.true_set:
563-
kh_destroy_str_starts(self.true_set)
564-
self.true_set = NULL
565-
if self.false_set:
566-
kh_destroy_str_starts(self.false_set)
567-
self.false_set = NULL
573+
self.close()
568574
parser_del(self.parser)
569575

570576
def close(self):
@@ -632,7 +638,6 @@ cdef class TextReader:
632638
char *word
633639
object name, old_name
634640
uint64_t hr, data_line = 0
635-
char *errors = "strict"
636641
StringPath path = _string_path(self.c_encoding)
637642
list header = []
638643
set unnamed_cols = set()
@@ -673,11 +678,8 @@ cdef class TextReader:
673678
for i in range(field_count):
674679
word = self.parser.words[start + i]
675680

676-
if path == UTF8:
677-
name = PyUnicode_FromString(word)
678-
elif path == ENCODED:
679-
name = PyUnicode_Decode(word, strlen(word),
680-
self.c_encoding, errors)
681+
name = PyUnicode_Decode(word, strlen(word),
682+
self.c_encoding, self.encoding_errors)
681683

682684
# We use this later when collecting placeholder names.
683685
old_name = name
@@ -831,7 +833,7 @@ cdef class TextReader:
831833
int status
832834

833835
with nogil:
834-
status = tokenize_nrows(self.parser, nrows)
836+
status = tokenize_nrows(self.parser, nrows, self.encoding_errors)
835837

836838
if self.parser.warn_msg != NULL:
837839
print(self.parser.warn_msg, file=sys.stderr)
@@ -859,7 +861,7 @@ cdef class TextReader:
859861
'the whole file')
860862
else:
861863
with nogil:
862-
status = tokenize_all_rows(self.parser)
864+
status = tokenize_all_rows(self.parser, self.encoding_errors)
863865

864866
if self.parser.warn_msg != NULL:
865867
print(self.parser.warn_msg, file=sys.stderr)
@@ -1201,7 +1203,7 @@ cdef class TextReader:
12011203

12021204
if path == UTF8:
12031205
return _string_box_utf8(self.parser, i, start, end, na_filter,
1204-
na_hashset)
1206+
na_hashset, self.encoding_errors)
12051207
elif path == ENCODED:
12061208
return _string_box_decode(self.parser, i, start, end,
12071209
na_filter, na_hashset, self.c_encoding)
@@ -1352,7 +1354,8 @@ cdef inline StringPath _string_path(char *encoding):
13521354

13531355
cdef _string_box_utf8(parser_t *parser, int64_t col,
13541356
int64_t line_start, int64_t line_end,
1355-
bint na_filter, kh_str_starts_t *na_hashset):
1357+
bint na_filter, kh_str_starts_t *na_hashset,
1358+
const char *encoding_errors):
13561359
cdef:
13571360
int error, na_count = 0
13581361
Py_ssize_t i, lines
@@ -1391,7 +1394,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
13911394
pyval = <object>table.vals[k]
13921395
else:
13931396
# box it. new ref?
1394-
pyval = PyUnicode_FromString(word)
1397+
pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors)
13951398

13961399
k = kh_put_strbox(table, word, &ret)
13971400
table.vals[k] = <PyObject *>pyval

pandas/_libs/src/parser/io.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
163163
}
164164

165165
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
166-
int *status) {
166+
int *status, const char *encoding_errors) {
167167
PyGILState_STATE state;
168168
PyObject *result, *func, *args, *tmp;
169169

@@ -191,7 +191,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
191191
*status = CALLING_READ_FAILED;
192192
return NULL;
193193
} else if (!PyBytes_Check(result)) {
194-
tmp = PyUnicode_AsUTF8String(result);
194+
tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors);
195195
Py_DECREF(result);
196196
if (tmp == NULL) {
197197
PyGILState_Release(state);

pandas/_libs/src/parser/io.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,6 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
6464
int *status);
6565

6666
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
67-
int *status);
67+
int *status, const char *encoding_errors);
6868

6969
#endif // PANDAS__LIBS_SRC_PARSER_IO_H_

pandas/_libs/src/parser/tokenizer.c

+12-8
Original file line numberDiff line numberDiff line change
@@ -553,13 +553,15 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
553553
return 0;
554554
}
555555

556-
static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
556+
static int parser_buffer_bytes(parser_t *self, size_t nbytes,
557+
const char *encoding_errors) {
557558
int status;
558559
size_t bytes_read;
559560

560561
status = 0;
561562
self->datapos = 0;
562-
self->data = self->cb_io(self->source, nbytes, &bytes_read, &status);
563+
self->data = self->cb_io(self->source, nbytes, &bytes_read, &status,
564+
encoding_errors);
563565
TRACE((
564566
"parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
565567
nbytes, bytes_read, status));
@@ -1334,7 +1336,8 @@ int parser_trim_buffers(parser_t *self) {
13341336
all : tokenize all the data vs. certain number of rows
13351337
*/
13361338

1337-
int _tokenize_helper(parser_t *self, size_t nrows, int all) {
1339+
int _tokenize_helper(parser_t *self, size_t nrows, int all,
1340+
const char *encoding_errors) {
13381341
int status = 0;
13391342
uint64_t start_lines = self->lines;
13401343

@@ -1350,7 +1353,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
13501353
if (!all && self->lines - start_lines >= nrows) break;
13511354

13521355
if (self->datapos == self->datalen) {
1353-
status = parser_buffer_bytes(self, self->chunksize);
1356+
status = parser_buffer_bytes(self, self->chunksize,
1357+
encoding_errors);
13541358

13551359
if (status == REACHED_EOF) {
13561360
// close out last line
@@ -1383,13 +1387,13 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
13831387
return status;
13841388
}
13851389

1386-
int tokenize_nrows(parser_t *self, size_t nrows) {
1387-
int status = _tokenize_helper(self, nrows, 0);
1390+
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) {
1391+
int status = _tokenize_helper(self, nrows, 0, encoding_errors);
13881392
return status;
13891393
}
13901394

1391-
int tokenize_all_rows(parser_t *self) {
1392-
int status = _tokenize_helper(self, -1, 1);
1395+
int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
1396+
int status = _tokenize_helper(self, -1, 1, encoding_errors);
13931397
return status;
13941398
}
13951399

pandas/_libs/src/parser/tokenizer.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ typedef enum {
8585
} QuoteStyle;
8686

8787
typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
88-
int *status);
88+
int *status, const char *encoding_errors);
8989
typedef int (*io_cleanup)(void *src);
9090

9191
typedef struct parser_t {
@@ -196,9 +196,9 @@ void parser_del(parser_t *self);
196196

197197
void parser_set_default_options(parser_t *self);
198198

199-
int tokenize_nrows(parser_t *self, size_t nrows);
199+
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors);
200200

201-
int tokenize_all_rows(parser_t *self);
201+
int tokenize_all_rows(parser_t *self, const char *encoding_errors);
202202

203203
// Have parsed / type-converted a chunk of data
204204
// and want to free memory from the token stream

pandas/core/frame.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -5818,7 +5818,7 @@ def sort_values( # type: ignore[override]
58185818
keys, orders=ascending, na_position=na_position, key=key
58195819
)
58205820
indexer = ensure_platform_int(indexer)
5821-
else:
5821+
elif len(by):
58225822

58235823
by = by[0]
58245824
k = self._get_label_or_level_values(by, axis=axis)
@@ -5833,6 +5833,8 @@ def sort_values( # type: ignore[override]
58335833
indexer = nargsort(
58345834
k, kind=kind, ascending=ascending, na_position=na_position, key=key
58355835
)
5836+
else:
5837+
return self.copy()
58365838

58375839
new_data = self._mgr.take(
58385840
indexer, axis=self._get_block_manager_axis(axis), verify=False

pandas/core/groupby/generic.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,7 @@
8181
validate_func_kwargs,
8282
)
8383
from pandas.core.apply import GroupByApply
84-
from pandas.core.arrays import (
85-
Categorical,
86-
ExtensionArray,
87-
)
84+
from pandas.core.arrays import Categorical
8885
from pandas.core.base import (
8986
DataError,
9087
SpecificationError,
@@ -1123,8 +1120,7 @@ def py_fallback(values: ArrayLike) -> ArrayLike:
11231120
obj: FrameOrSeriesUnion
11241121

11251122
# call our grouper again with only this block
1126-
if isinstance(values, ExtensionArray) or values.ndim == 1:
1127-
# TODO(EA2D): special case not needed with 2D EAs
1123+
if values.ndim == 1:
11281124
obj = Series(values)
11291125
else:
11301126
# TODO special case not needed with ArrayManager

0 commit comments

Comments
 (0)