Skip to content

Commit b8c42bd

Browse files
committed
Merge remote-tracking branch 'upstream/master' into map-na-action
2 parents c3dd5b9 + f0fc6dd commit b8c42bd

File tree

110 files changed

+2484
-2099
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

110 files changed

+2484
-2099
lines changed

asv_bench/benchmarks/frame_ctor.py

+45
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22

3+
import pandas as pd
34
from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range
45

56
from .pandas_vb_common import tm
@@ -118,4 +119,48 @@ def time_frame_from_range(self):
118119
self.df = DataFrame(self.data)
119120

120121

122+
class FromArrays:
123+
124+
goal_time = 0.2
125+
126+
def setup(self):
127+
N_rows = 1000
128+
N_cols = 1000
129+
self.float_arrays = [np.random.randn(N_rows) for _ in range(N_cols)]
130+
self.sparse_arrays = [
131+
pd.arrays.SparseArray(np.random.randint(0, 2, N_rows), dtype="float64")
132+
for _ in range(N_cols)
133+
]
134+
self.int_arrays = [
135+
pd.array(np.random.randint(1000, size=N_rows), dtype="Int64")
136+
for _ in range(N_cols)
137+
]
138+
self.index = pd.Index(range(N_rows))
139+
self.columns = pd.Index(range(N_cols))
140+
141+
def time_frame_from_arrays_float(self):
142+
self.df = DataFrame._from_arrays(
143+
self.float_arrays,
144+
index=self.index,
145+
columns=self.columns,
146+
verify_integrity=False,
147+
)
148+
149+
def time_frame_from_arrays_int(self):
150+
self.df = DataFrame._from_arrays(
151+
self.int_arrays,
152+
index=self.index,
153+
columns=self.columns,
154+
verify_integrity=False,
155+
)
156+
157+
def time_frame_from_arrays_sparse(self):
158+
self.df = DataFrame._from_arrays(
159+
self.sparse_arrays,
160+
index=self.index,
161+
columns=self.columns,
162+
verify_integrity=False,
163+
)
164+
165+
121166
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/conf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@
195195

196196
# The theme to use for HTML and HTML Help pages. Major themes that come with
197197
# Sphinx are currently 'default' and 'sphinxdoc'.
198-
html_theme = "pandas_sphinx_theme"
198+
html_theme = "pydata_sphinx_theme"
199199

200200
# The style sheet to use for HTML and HTML Help pages. A file of that name
201201
# must exist either in Sphinx' static/ path, or in one of the custom paths

doc/source/getting_started/intro_tutorials/03_subset_data.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ name of the column of interest.
8888
</ul>
8989

9090
Each column in a :class:`DataFrame` is a :class:`Series`. As a single column is
91-
selected, the returned object is a pandas :class:`DataFrame`. We can verify this
91+
selected, the returned object is a pandas :class:`Series`. We can verify this
9292
by checking the type of the output:
9393

9494
.. ipython:: python

doc/source/index.rst.template

-1
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ programming language.
119119
:titlesonly:
120120
{% endif %}
121121
{% if not single_doc %}
122-
What's New in 1.1.0 <whatsnew/v1.1.0>
123122
getting_started/index
124123
user_guide/index
125124
{% endif -%}

doc/source/user_guide/scale.rst

+1
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ We'll import ``dask.dataframe`` and notice that the API feels similar to pandas.
246246
We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in.
247247

248248
.. ipython:: python
249+
:okwarning:
249250
250251
import dask.dataframe as dd
251252

doc/source/whatsnew/v1.1.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -343,10 +343,12 @@ I/O
343343
timestamps with ``version="2.0"`` (:issue:`31652`).
344344
- Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
345345
- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
346+
- Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`)
346347
- :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`)
347348
- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
348349
- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)
349350
- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`)
351+
- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`)
350352

351353

352354
Plotting
@@ -404,6 +406,7 @@ Other
404406
- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`)
405407
- Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`)
406408
- Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`)
409+
- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
407410

408411
.. ---------------------------------------------------------------------------
409412

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,5 @@ dependencies:
104104
- pyreadstat # pandas.read_spss
105105
- tabulate>=0.8.3 # DataFrame.to_markdown
106106
- pip:
107-
- git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
107+
- git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master
108108
- git+https://github.com/numpy/numpydoc

pandas/_libs/hashtable_class_helper.pxi.in

+8-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1212
from pandas._libs.tslibs.util cimport get_c_string
1313
from pandas._libs.missing cimport C_NA
1414

15+
cdef extern from "Python.h":
16+
void PyErr_Clear()
17+
1518
{{py:
1619

1720
# name, dtype, c_type
@@ -193,7 +196,7 @@ cdef class StringVector:
193196

194197
append_data_string(self.data, x)
195198

196-
cdef extend(self, ndarray[:] x):
199+
cdef extend(self, ndarray[object] x):
197200
for i in range(len(x)):
198201
self.append(x[i])
199202

@@ -238,7 +241,7 @@ cdef class ObjectVector:
238241
self.external_view_exists = True
239242
return self.ao
240243

241-
cdef extend(self, ndarray[:] x):
244+
cdef extend(self, ndarray[object] x):
242245
for i in range(len(x)):
243246
self.append(x[i])
244247

@@ -790,6 +793,9 @@ cdef class StringHashTable(HashTable):
790793
else:
791794
# if ignore_na is False, we also stringify NaN/None/etc.
792795
v = get_c_string(<str>val)
796+
if v == NULL:
797+
PyErr_Clear()
798+
v = get_c_string(<str>repr(val))
793799
vecs[i] = v
794800

795801
# compute

pandas/_libs/internals.pyx

+17-14
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@ cdef class BlockPlacement:
3232
self._has_slice = False
3333
self._has_array = False
3434

35-
if isinstance(val, slice):
35+
if isinstance(val, int):
36+
slc = slice(val, val + 1, 1)
37+
self._as_slice = slc
38+
self._has_slice = True
39+
elif isinstance(val, slice):
3640
slc = slice_canonize(val)
3741

3842
if slc.start != slc.stop:
@@ -378,25 +382,23 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True):
378382

379383
object blkno
380384
object group_dict = defaultdict(list)
381-
int64_t[:] res_view
382385

383386
n = blknos.shape[0]
384-
385-
if n == 0:
386-
return
387-
387+
result = list()
388388
start = 0
389389
cur_blkno = blknos[start]
390390

391-
if group is False:
391+
if n == 0:
392+
pass
393+
elif group is False:
392394
for i in range(1, n):
393395
if blknos[i] != cur_blkno:
394-
yield cur_blkno, slice(start, i)
396+
result.append((cur_blkno, slice(start, i)))
395397

396398
start = i
397399
cur_blkno = blknos[i]
398400

399-
yield cur_blkno, slice(start, n)
401+
result.append((cur_blkno, slice(start, n)))
400402
else:
401403
for i in range(1, n):
402404
if blknos[i] != cur_blkno:
@@ -409,19 +411,20 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True):
409411

410412
for blkno, slices in group_dict.items():
411413
if len(slices) == 1:
412-
yield blkno, slice(slices[0][0], slices[0][1])
414+
result.append((blkno, slice(slices[0][0], slices[0][1])))
413415
else:
414416
tot_len = sum(stop - start for start, stop in slices)
415-
result = np.empty(tot_len, dtype=np.int64)
416-
res_view = result
417+
arr = np.empty(tot_len, dtype=np.int64)
417418

418419
i = 0
419420
for start, stop in slices:
420421
for diff in range(start, stop):
421-
res_view[i] = diff
422+
arr[i] = diff
422423
i += 1
423424

424-
yield blkno, result
425+
result.append((blkno, arr))
426+
427+
return result
425428

426429

427430
def get_blkno_placements(blknos, group: bool = True):

pandas/_libs/parsers.pyx

+2-17
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,9 @@ cdef extern from "parser/io.h":
241241
void* buffer_mmap_bytes(void *source, size_t nbytes,
242242
size_t *bytes_read, int *status)
243243

244-
void *new_file_source(char *fname, size_t buffer_size)
244+
void *new_file_source(char *fname, size_t buffer_size) except NULL
245245

246-
void *new_rd_source(object obj)
246+
void *new_rd_source(object obj) except NULL
247247

248248
int del_file_source(void *src)
249249
int del_rd_source(void *src)
@@ -667,26 +667,12 @@ cdef class TextReader:
667667
ptr = new_file_source(source, self.parser.chunksize)
668668
self.parser.cb_io = &buffer_file_bytes
669669
self.parser.cb_cleanup = &del_file_source
670-
671-
if ptr == NULL:
672-
if not os.path.exists(source):
673-
674-
raise FileNotFoundError(
675-
ENOENT,
676-
f'File {usource} does not exist',
677-
usource)
678-
raise IOError('Initializing from file failed')
679-
680670
self.parser.source = ptr
681671

682672
elif hasattr(source, 'read'):
683673
# e.g., StringIO
684674

685675
ptr = new_rd_source(source)
686-
if ptr == NULL:
687-
raise IOError('Initializing parser from file-like '
688-
'object failed')
689-
690676
self.parser.source = ptr
691677
self.parser.cb_io = &buffer_rd_bytes
692678
self.parser.cb_cleanup = &del_rd_source
@@ -806,7 +792,6 @@ cdef class TextReader:
806792
self._tokenize_rows(1)
807793

808794
header = [ self.names ]
809-
data_line = 0
810795

811796
if self.parser.lines < 1:
812797
field_count = len(header[0])

pandas/_libs/src/parser/io.c

+11-8
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ The full license is in the LICENSE file, distributed with this software.
2828
void *new_file_source(char *fname, size_t buffer_size) {
2929
file_source *fs = (file_source *)malloc(sizeof(file_source));
3030
if (fs == NULL) {
31+
PyErr_NoMemory();
3132
return NULL;
3233
}
3334

@@ -41,17 +42,20 @@ void *new_file_source(char *fname, size_t buffer_size) {
4142
int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
4243
if (required == 0) {
4344
free(fs);
45+
PyErr_SetFromWindowsErr(0);
4446
return NULL;
4547
}
4648
wname = (wchar_t*)malloc(required * sizeof(wchar_t));
4749
if (wname == NULL) {
4850
free(fs);
51+
PyErr_NoMemory();
4952
return NULL;
5053
}
5154
if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) <
5255
required) {
5356
free(wname);
5457
free(fs);
58+
PyErr_SetFromWindowsErr(0);
5559
return NULL;
5660
}
5761
fs->fd = _wopen(wname, O_RDONLY | O_BINARY);
@@ -62,6 +66,7 @@ void *new_file_source(char *fname, size_t buffer_size) {
6266
#endif
6367
if (fs->fd == -1) {
6468
free(fs);
69+
PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname);
6570
return NULL;
6671
}
6772

@@ -71,6 +76,7 @@ void *new_file_source(char *fname, size_t buffer_size) {
7176
if (fs->buffer == NULL) {
7277
close(fs->fd);
7378
free(fs);
79+
PyErr_NoMemory();
7480
return NULL;
7581
}
7682

@@ -83,6 +89,10 @@ void *new_file_source(char *fname, size_t buffer_size) {
8389
void *new_rd_source(PyObject *obj) {
8490
rd_source *rds = (rd_source *)malloc(sizeof(rd_source));
8591

92+
if (rds == NULL) {
93+
PyErr_NoMemory();
94+
return NULL;
95+
}
8696
/* hold on to this object */
8797
Py_INCREF(obj);
8898
rds->obj = obj;
@@ -220,20 +230,15 @@ void *new_mmap(char *fname) {
220230

221231
mm = (memory_map *)malloc(sizeof(memory_map));
222232
if (mm == NULL) {
223-
fprintf(stderr, "new_file_buffer: malloc() failed.\n");
224-
return (NULL);
233+
return NULL;
225234
}
226235
mm->fd = open(fname, O_RDONLY | O_BINARY);
227236
if (mm->fd == -1) {
228-
fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n",
229-
fname, errno);
230237
free(mm);
231238
return NULL;
232239
}
233240

234241
if (fstat(mm->fd, &stat) == -1) {
235-
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n",
236-
errno);
237242
close(mm->fd);
238243
free(mm);
239244
return NULL;
@@ -242,8 +247,6 @@ void *new_mmap(char *fname) {
242247

243248
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
244249
if (mm->memmap == MAP_FAILED) {
245-
/* XXX Eventually remove this print statement. */
246-
fprintf(stderr, "new_file_buffer: mmap() failed.\n");
247250
close(mm->fd);
248251
free(mm);
249252
return NULL;

0 commit comments

Comments
 (0)