Skip to content

Commit 07c9344

Browse files
authored
Merge pull request #191 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents 12d1b4b + e9c85a4 commit 07c9344

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+926
-481
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ asv_bench/env/
104104
asv_bench/html/
105105
asv_bench/results/
106106
asv_bench/pandas/
107+
test-data.xml
107108

108109
# Documentation generated files #
109110
#################################

asv_bench/benchmarks/io/style.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,19 @@ def setup(self, cols, rows):
1717

1818
def time_apply_render(self, cols, rows):
1919
self._style_apply()
20-
self.st.render()
20+
self.st._render_html()
2121

2222
def peakmem_apply_render(self, cols, rows):
2323
self._style_apply()
24-
self.st.render()
24+
self.st._render_html()
2525

2626
def time_classes_render(self, cols, rows):
2727
self._style_classes()
28-
self.st.render()
28+
self.st._render_html()
2929

3030
def peakmem_classes_render(self, cols, rows):
3131
self._style_classes()
32-
self.st.render()
32+
self.st._render_html()
3333

3434
def _style_apply(self):
3535
def _apply_func(s):

asv_bench/benchmarks/strings.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -213,13 +213,18 @@ def time_cat(self, other_cols, sep, na_rep, na_frac):
213213

214214
class Contains:
215215

216-
params = [True, False]
217-
param_names = ["regex"]
216+
params = (["str", "string", "arrow_string"], [True, False])
217+
param_names = ["dtype", "regex"]
218+
219+
def setup(self, dtype, regex):
220+
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
218221

219-
def setup(self, regex):
220-
self.s = Series(tm.makeStringIndex(10 ** 5))
222+
try:
223+
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
224+
except ImportError:
225+
raise NotImplementedError
221226

222-
def time_contains(self, regex):
227+
def time_contains(self, dtype, regex):
223228
self.s.str.contains("A", regex=regex)
224229

225230

doc/source/whatsnew/v1.2.5.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Regression in :func:`concat` between two :class:`DataFrame` objects where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
18-
-
18+
- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
1919
-
2020

2121
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.3.0.rst

+5-2
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,7 @@ Deprecations
609609
- Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`)
610610
- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
611611
- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`)
612-
- The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories` is deprecated and will be removed in a future version (:issue:`37643`)
612+
- The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, and :meth:`Categorical.reorder_categories` is deprecated and will be removed in a future version (:issue:`37643`)
613613
- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`)
614614

615615
.. ---------------------------------------------------------------------------
@@ -705,8 +705,9 @@ Conversion
705705
- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
706706
- Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
707707
- Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
708+
- Bug in :func:`factorize` where, when given an array with a numeric numpy dtype lower than int64, uint64 and float64, the unique values did not keep their original dtype (:issue:`41132`)
708709
- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
709-
-
710+
- Bug in :func:`qcut` raising an error when taking ``Float64Dtype`` as input (:issue:`40730`)
710711

711712
Strings
712713
^^^^^^^
@@ -847,6 +848,8 @@ Groupby/resample/rolling
847848
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementation bounds (:issue:`40767`)
848849
- Bug in :meth:`.GroupBy.rank` with nullable dtypes incorrectly raising ``TypeError`` (:issue:`41010`)
849850
- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
851+
- Bug in :meth:`DataFrame.rolling` returning a mean of zero for an all-``NaN`` window with ``min_periods=0`` if the calculation is not numerically stable (:issue:`41053`)
852+
- Bug in :meth:`DataFrame.rolling` returning a non-zero sum for an all-``NaN`` window with ``min_periods=0`` if the calculation is not numerically stable (:issue:`41053`)
850853

851854
Reshaping
852855
^^^^^^^^^

pandas/_libs/src/ujson/python/objToJSON.c

+29-94
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@ typedef struct __PdBlockContext {
8383
int ncols;
8484
int transpose;
8585

86-
int *cindices; // frame column -> block column map
8786
NpyArrContext **npyCtxts; // NpyArrContext for each column
8887
} PdBlockContext;
8988

@@ -294,7 +293,12 @@ static int is_simple_frame(PyObject *obj) {
294293
if (!mgr) {
295294
return 0;
296295
}
297-
int ret = (get_attr_length(mgr, "blocks") <= 1);
296+
int ret;
297+
if (PyObject_HasAttrString(mgr, "blocks")) {
298+
ret = (get_attr_length(mgr, "blocks") <= 1);
299+
} else {
300+
ret = 0;
301+
}
298302

299303
Py_DECREF(mgr);
300304
return ret;
@@ -656,16 +660,10 @@ void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
656660
}
657661

658662
void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
659-
PyObject *obj, *blocks, *block, *values, *tmp;
660-
PyArrayObject *locs;
663+
PyObject *obj, *values, *arrays, *array;
661664
PdBlockContext *blkCtxt;
662665
NpyArrContext *npyarr;
663666
Py_ssize_t i;
664-
NpyIter *iter;
665-
NpyIter_IterNextFunc *iternext;
666-
npy_int64 **dataptr;
667-
npy_int64 colIdx;
668-
npy_intp idx;
669667

670668
obj = (PyObject *)_obj;
671669

@@ -687,7 +685,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
687685

688686
if (blkCtxt->ncols == 0) {
689687
blkCtxt->npyCtxts = NULL;
690-
blkCtxt->cindices = NULL;
691688

692689
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
693690
return;
@@ -701,104 +698,45 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
701698
return;
702699
}
703700

704-
blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols);
705-
if (!blkCtxt->cindices) {
706-
PyErr_NoMemory();
707-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
708-
return;
709-
}
710-
711-
blocks = get_sub_attr(obj, "_mgr", "blocks");
712-
if (!blocks) {
701+
arrays = get_sub_attr(obj, "_mgr", "column_arrays");
702+
if (!arrays) {
713703
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
714704
return;
715-
} else if (!PyTuple_Check(blocks)) {
716-
PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!");
717-
goto BLKRET;
718705
}
719706

720-
// force transpose so each NpyArrContext strides down its column
721-
GET_TC(tc)->transpose = 1;
722-
723-
for (i = 0; i < PyObject_Length(blocks); i++) {
724-
block = PyTuple_GET_ITEM(blocks, i);
725-
if (!block) {
707+
for (i = 0; i < PyObject_Length(arrays); i++) {
708+
array = PyList_GET_ITEM(arrays, i);
709+
if (!array) {
726710
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
727-
goto BLKRET;
711+
goto ARR_RET;
728712
}
729713

730-
tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL);
731-
if (!tmp) {
714+
// ensure we have a numpy array (i.e. np.asarray)
715+
values = PyObject_CallMethod(array, "__array__", NULL);
716+
if ((!values) || (!PyArray_CheckExact(values))) {
717+
// Didn't get a numpy array
732718
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
733719
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
734-
goto BLKRET;
735-
}
736-
737-
values = PyArray_Transpose((PyArrayObject *)tmp, NULL);
738-
Py_DECREF(tmp);
739-
if (!values) {
740-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
741-
goto BLKRET;
742-
}
743-
744-
locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array");
745-
if (!locs) {
746-
Py_DECREF(values);
747-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
748-
goto BLKRET;
720+
goto ARR_RET;
749721
}
750722

751-
iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER,
752-
NPY_NO_CASTING, NULL);
753-
if (!iter) {
754-
Py_DECREF(values);
755-
Py_DECREF(locs);
756-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
757-
goto BLKRET;
758-
}
759-
iternext = NpyIter_GetIterNext(iter, NULL);
760-
if (!iternext) {
761-
NpyIter_Deallocate(iter);
762-
Py_DECREF(values);
763-
Py_DECREF(locs);
764-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
765-
goto BLKRET;
766-
}
767-
dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter);
768-
do {
769-
colIdx = **dataptr;
770-
idx = NpyIter_GetIterIndex(iter);
723+
GET_TC(tc)->newObj = values;
771724

772-
blkCtxt->cindices[colIdx] = idx;
725+
// init a dedicated context for this column
726+
NpyArr_iterBegin(obj, tc);
727+
npyarr = GET_TC(tc)->npyarr;
773728

774-
// Reference freed in Pdblock_iterend
775-
Py_INCREF(values);
776-
GET_TC(tc)->newObj = values;
777-
778-
// init a dedicated context for this column
779-
NpyArr_iterBegin(obj, tc);
780-
npyarr = GET_TC(tc)->npyarr;
781-
782-
// set the dataptr to our desired column and initialise
783-
if (npyarr != NULL) {
784-
npyarr->dataptr += npyarr->stride * idx;
785-
NpyArr_iterNext(obj, tc);
786-
}
787-
GET_TC(tc)->itemValue = NULL;
788-
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
789-
790-
blkCtxt->npyCtxts[colIdx] = npyarr;
791-
GET_TC(tc)->newObj = NULL;
792-
} while (iternext(iter));
729+
GET_TC(tc)->itemValue = NULL;
730+
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
793731

794-
NpyIter_Deallocate(iter);
795-
Py_DECREF(values);
796-
Py_DECREF(locs);
732+
blkCtxt->npyCtxts[i] = npyarr;
733+
GET_TC(tc)->newObj = NULL;
797734
}
798735
GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
736+
goto ARR_RET;
799737

800-
BLKRET:
801-
Py_DECREF(blocks);
738+
ARR_RET:
739+
Py_DECREF(arrays);
802740
}
803741

804742
void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
@@ -830,9 +768,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
830768
if (blkCtxt->npyCtxts) {
831769
PyObject_Free(blkCtxt->npyCtxts);
832770
}
833-
if (blkCtxt->cindices) {
834-
PyObject_Free(blkCtxt->cindices);
835-
}
836771
PyObject_Free(blkCtxt);
837772
}
838773
}

0 commit comments

Comments
 (0)