Skip to content

Commit 2d03ceb

Browse files
author
drollolo
committed
Merge remote-tracking branch 'upstream/master' into sa-errors
2 parents b023837 + 34f3360 commit 2d03ceb

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+485
-438
lines changed

doc/source/development/internals.rst

+2-8
Original file line numberDiff line numberDiff line change
@@ -89,16 +89,10 @@ pandas extends NumPy's type system with custom types, like ``Categorical`` or
8989
datetimes with a timezone, so we have multiple notions of "values". For 1-D
9090
containers (``Index`` classes and ``Series``) we have the following convention:
9191

92-
* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally,
93-
``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``,
94-
this returns the codes, not the array of objects.
9592
* ``cls._values`` refers to the "best possible" array. This could be an
96-
``ndarray``, ``ExtensionArray``, or an ``Index`` subclass (note: we're in the
97-
process of removing the index subclasses here so that it's always an
98-
``ndarray`` or ``ExtensionArray``).
93+
``ndarray`` or ``ExtensionArray``.
9994

100-
So, for example, ``Series[category]._values`` is a ``Categorical``, while
101-
``Series[category]._ndarray_values`` is the underlying codes.
95+
So, for example, ``Series[category]._values`` is a ``Categorical``.
10296

10397
.. _ref-subclassing-pandas:
10498

doc/source/reference/extensions.rst

-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ objects.
3737
api.extensions.ExtensionArray._from_factorized
3838
api.extensions.ExtensionArray._from_sequence
3939
api.extensions.ExtensionArray._from_sequence_of_strings
40-
api.extensions.ExtensionArray._ndarray_values
4140
api.extensions.ExtensionArray._reduce
4241
api.extensions.ExtensionArray._values_for_argsort
4342
api.extensions.ExtensionArray._values_for_factorize

doc/source/whatsnew/v1.1.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ Indexing
303303
- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on an object-dtype :class:`Index` that is not all-integers (:issue:`31905`)
304304
- Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`)
305305
- Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`)
306+
- Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. ``np.dtype`` (:issue:`32684`)
306307

307308
Missing
308309
^^^^^^^
@@ -342,10 +343,12 @@ I/O
342343
timestamps with ``version="2.0"`` (:issue:`31652`).
343344
- Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
344345
- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
346+
- Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`)
345347
- :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`)
346348
- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
347349
- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)
348350
- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`)
351+
- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`)
349352

350353

351354
Plotting

pandas/_libs/groupby.pyx

+12-4
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,9 @@ def group_last(rank_t[:, :] out,
869869

870870
assert min_count == -1, "'min_count' only used in add and prod"
871871

872-
if not len(values) == len(labels):
872+
# TODO(cython 3.0):
873+
# Instead of `labels.shape[0]` use `len(labels)`
874+
if not len(values) == labels.shape[0]:
873875
raise AssertionError("len(index) != len(labels)")
874876

875877
nobs = np.zeros((<object>out).shape, dtype=np.int64)
@@ -960,7 +962,9 @@ def group_nth(rank_t[:, :] out,
960962

961963
assert min_count == -1, "'min_count' only used in add and prod"
962964

963-
if not len(values) == len(labels):
965+
# TODO(cython 3.0):
966+
# Instead of `labels.shape[0]` use `len(labels)`
967+
if not len(values) == labels.shape[0]:
964968
raise AssertionError("len(index) != len(labels)")
965969

966970
nobs = np.zeros((<object>out).shape, dtype=np.int64)
@@ -1254,7 +1258,9 @@ def group_max(groupby_t[:, :] out,
12541258

12551259
assert min_count == -1, "'min_count' only used in add and prod"
12561260

1257-
if not len(values) == len(labels):
1261+
# TODO(cython 3.0):
1262+
# Instead of `labels.shape[0]` use `len(labels)`
1263+
if not len(values) == labels.shape[0]:
12581264
raise AssertionError("len(index) != len(labels)")
12591265

12601266
nobs = np.zeros((<object>out).shape, dtype=np.int64)
@@ -1327,7 +1333,9 @@ def group_min(groupby_t[:, :] out,
13271333

13281334
assert min_count == -1, "'min_count' only used in add and prod"
13291335

1330-
if not len(values) == len(labels):
1336+
# TODO(cython 3.0):
1337+
# Instead of `labels.shape[0]` use `len(labels)`
1338+
if not len(values) == labels.shape[0]:
13311339
raise AssertionError("len(index) != len(labels)")
13321340

13331341
nobs = np.zeros((<object>out).shape, dtype=np.int64)

pandas/_libs/internals.pyx

+4-1
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,10 @@ cdef slice_getitem(slice slc, ind):
308308
return slice(s_start, s_stop, s_step)
309309

310310
else:
311-
return np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]
311+
# NOTE:
312+
# this is the C-optimized equivalent of
313+
# `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]`
314+
return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind]
312315

313316

314317
@cython.boundscheck(False)

pandas/_libs/parsers.pyx

+2-16
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,9 @@ cdef extern from "parser/io.h":
241241
void* buffer_mmap_bytes(void *source, size_t nbytes,
242242
size_t *bytes_read, int *status)
243243

244-
void *new_file_source(char *fname, size_t buffer_size)
244+
void *new_file_source(char *fname, size_t buffer_size) except NULL
245245

246-
void *new_rd_source(object obj)
246+
void *new_rd_source(object obj) except NULL
247247

248248
int del_file_source(void *src)
249249
int del_rd_source(void *src)
@@ -667,26 +667,12 @@ cdef class TextReader:
667667
ptr = new_file_source(source, self.parser.chunksize)
668668
self.parser.cb_io = &buffer_file_bytes
669669
self.parser.cb_cleanup = &del_file_source
670-
671-
if ptr == NULL:
672-
if not os.path.exists(source):
673-
674-
raise FileNotFoundError(
675-
ENOENT,
676-
f'File {usource} does not exist',
677-
usource)
678-
raise IOError('Initializing from file failed')
679-
680670
self.parser.source = ptr
681671

682672
elif hasattr(source, 'read'):
683673
# e.g., StringIO
684674

685675
ptr = new_rd_source(source)
686-
if ptr == NULL:
687-
raise IOError('Initializing parser from file-like '
688-
'object failed')
689-
690676
self.parser.source = ptr
691677
self.parser.cb_io = &buffer_rd_bytes
692678
self.parser.cb_cleanup = &del_rd_source

pandas/_libs/src/parser/io.c

+11-8
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ The full license is in the LICENSE file, distributed with this software.
2828
void *new_file_source(char *fname, size_t buffer_size) {
2929
file_source *fs = (file_source *)malloc(sizeof(file_source));
3030
if (fs == NULL) {
31+
PyErr_NoMemory();
3132
return NULL;
3233
}
3334

@@ -41,17 +42,20 @@ void *new_file_source(char *fname, size_t buffer_size) {
4142
int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
4243
if (required == 0) {
4344
free(fs);
45+
PyErr_SetFromWindowsErr(0);
4446
return NULL;
4547
}
4648
wname = (wchar_t*)malloc(required * sizeof(wchar_t));
4749
if (wname == NULL) {
4850
free(fs);
51+
PyErr_NoMemory();
4952
return NULL;
5053
}
5154
if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) <
5255
required) {
5356
free(wname);
5457
free(fs);
58+
PyErr_SetFromWindowsErr(0);
5559
return NULL;
5660
}
5761
fs->fd = _wopen(wname, O_RDONLY | O_BINARY);
@@ -62,6 +66,7 @@ void *new_file_source(char *fname, size_t buffer_size) {
6266
#endif
6367
if (fs->fd == -1) {
6468
free(fs);
69+
PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname);
6570
return NULL;
6671
}
6772

@@ -71,6 +76,7 @@ void *new_file_source(char *fname, size_t buffer_size) {
7176
if (fs->buffer == NULL) {
7277
close(fs->fd);
7378
free(fs);
79+
PyErr_NoMemory();
7480
return NULL;
7581
}
7682

@@ -83,6 +89,10 @@ void *new_file_source(char *fname, size_t buffer_size) {
8389
void *new_rd_source(PyObject *obj) {
8490
rd_source *rds = (rd_source *)malloc(sizeof(rd_source));
8591

92+
if (rds == NULL) {
93+
PyErr_NoMemory();
94+
return NULL;
95+
}
8696
/* hold on to this object */
8797
Py_INCREF(obj);
8898
rds->obj = obj;
@@ -220,20 +230,15 @@ void *new_mmap(char *fname) {
220230

221231
mm = (memory_map *)malloc(sizeof(memory_map));
222232
if (mm == NULL) {
223-
fprintf(stderr, "new_file_buffer: malloc() failed.\n");
224-
return (NULL);
233+
return NULL;
225234
}
226235
mm->fd = open(fname, O_RDONLY | O_BINARY);
227236
if (mm->fd == -1) {
228-
fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n",
229-
fname, errno);
230237
free(mm);
231238
return NULL;
232239
}
233240

234241
if (fstat(mm->fd, &stat) == -1) {
235-
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n",
236-
errno);
237242
close(mm->fd);
238243
free(mm);
239244
return NULL;
@@ -242,8 +247,6 @@ void *new_mmap(char *fname) {
242247

243248
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
244249
if (mm->memmap == MAP_FAILED) {
245-
/* XXX Eventually remove this print statement. */
246-
fprintf(stderr, "new_file_buffer: mmap() failed.\n");
247250
close(mm->fd);
248251
free(mm);
249252
return NULL;

pandas/_libs/src/ujson/python/date_conversions.c

+26
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,29 @@ npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) {
116116
npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
117117
return NpyDateTimeToEpoch(npy_dt, base);
118118
}
119+
120+
/* Converts the int64_t representation of a duration to ISO; mutates len */
121+
char *int64ToIsoDuration(int64_t value, size_t *len) {
122+
pandas_timedeltastruct tds;
123+
int ret_code;
124+
125+
pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds);
126+
127+
// Max theoretical length of ISO Duration with 64 bit day
128+
// as the largest unit is 70 characters + 1 for a null terminator
129+
char *result = PyObject_Malloc(71);
130+
if (result == NULL) {
131+
PyErr_NoMemory();
132+
return NULL;
133+
}
134+
135+
ret_code = make_iso_8601_timedelta(&tds, result, len);
136+
if (ret_code == -1) {
137+
PyErr_SetString(PyExc_ValueError,
138+
"Could not convert timedelta value to string");
139+
PyObject_Free(result);
140+
return NULL;
141+
}
142+
143+
return result;
144+
}

pandas/_libs/src/ujson/python/date_conversions.h

+2
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,6 @@ char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len);
2828
// Convert a Python Date/Datetime to Unix epoch with resolution base
2929
npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base);
3030

31+
char *int64ToIsoDuration(int64_t value, size_t *len);
32+
3133
#endif

0 commit comments

Comments
 (0)