Skip to content

Commit dc7b704

Browse files
committed
Merge branch 'master' of github.com:pandas-dev/pandas into min_max_sparse
2 parents 0e6a384 + 0158382 commit dc7b704

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

74 files changed

+1443
-817
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ asv_bench/env/
104104
asv_bench/html/
105105
asv_bench/results/
106106
asv_bench/pandas/
107+
test-data.xml
107108

108109
# Documentation generated files #
109110
#################################

asv_bench/benchmarks/io/style.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,19 @@ def setup(self, cols, rows):
1717

1818
def time_apply_render(self, cols, rows):
1919
self._style_apply()
20-
self.st.render()
20+
self.st._render_html()
2121

2222
def peakmem_apply_render(self, cols, rows):
2323
self._style_apply()
24-
self.st.render()
24+
self.st._render_html()
2525

2626
def time_classes_render(self, cols, rows):
2727
self._style_classes()
28-
self.st.render()
28+
self.st._render_html()
2929

3030
def peakmem_classes_render(self, cols, rows):
3131
self._style_classes()
32-
self.st.render()
32+
self.st._render_html()
3333

3434
def _style_apply(self):
3535
def _apply_func(s):

asv_bench/benchmarks/strings.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -213,13 +213,18 @@ def time_cat(self, other_cols, sep, na_rep, na_frac):
213213

214214
class Contains:
215215

216-
params = [True, False]
217-
param_names = ["regex"]
216+
params = (["str", "string", "arrow_string"], [True, False])
217+
param_names = ["dtype", "regex"]
218+
219+
def setup(self, dtype, regex):
220+
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
218221

219-
def setup(self, regex):
220-
self.s = Series(tm.makeStringIndex(10 ** 5))
222+
try:
223+
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
224+
except ImportError:
225+
raise NotImplementedError
221226

222-
def time_contains(self, regex):
227+
def time_contains(self, dtype, regex):
223228
self.s.str.contains("A", regex=regex)
224229

225230

ci/code_checks.sh

Lines changed: 28 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -106,84 +106,34 @@ fi
106106
### DOCTESTS ###
107107
if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
108108

109-
# Individual files
110-
111-
MSG='Doctests accessor.py' ; echo $MSG
112-
pytest -q --doctest-modules pandas/core/accessor.py
113-
RET=$(($RET + $?)) ; echo $MSG "DONE"
114-
115-
MSG='Doctests aggregation.py' ; echo $MSG
116-
pytest -q --doctest-modules pandas/core/aggregation.py
117-
RET=$(($RET + $?)) ; echo $MSG "DONE"
118-
119-
MSG='Doctests base.py' ; echo $MSG
120-
pytest -q --doctest-modules pandas/core/base.py
121-
RET=$(($RET + $?)) ; echo $MSG "DONE"
122-
123-
MSG='Doctests construction.py' ; echo $MSG
124-
pytest -q --doctest-modules pandas/core/construction.py
125-
RET=$(($RET + $?)) ; echo $MSG "DONE"
126-
127-
MSG='Doctests frame.py' ; echo $MSG
128-
pytest -q --doctest-modules pandas/core/frame.py
129-
RET=$(($RET + $?)) ; echo $MSG "DONE"
130-
131-
MSG='Doctests generic.py' ; echo $MSG
132-
pytest -q --doctest-modules pandas/core/generic.py
133-
RET=$(($RET + $?)) ; echo $MSG "DONE"
134-
135-
MSG='Doctests series.py' ; echo $MSG
136-
pytest -q --doctest-modules pandas/core/series.py
137-
RET=$(($RET + $?)) ; echo $MSG "DONE"
138-
139-
MSG='Doctests strings.py' ; echo $MSG
140-
pytest -q --doctest-modules pandas/core/strings/
141-
RET=$(($RET + $?)) ; echo $MSG "DONE"
142-
143-
MSG='Doctests sql.py' ; echo $MSG
144-
pytest -q --doctest-modules pandas/io/sql.py
145-
RET=$(($RET + $?)) ; echo $MSG "DONE"
146-
147-
# Directories
148-
149-
MSG='Doctests arrays'; echo $MSG
150-
pytest -q --doctest-modules pandas/core/arrays/
151-
RET=$(($RET + $?)) ; echo $MSG "DONE"
152-
153-
MSG='Doctests computation' ; echo $MSG
154-
pytest -q --doctest-modules pandas/core/computation/
155-
RET=$(($RET + $?)) ; echo $MSG "DONE"
156-
157-
MSG='Doctests dtypes'; echo $MSG
158-
pytest -q --doctest-modules pandas/core/dtypes/
159-
RET=$(($RET + $?)) ; echo $MSG "DONE"
160-
161-
MSG='Doctests groupby' ; echo $MSG
162-
pytest -q --doctest-modules pandas/core/groupby/
163-
RET=$(($RET + $?)) ; echo $MSG "DONE"
164-
165-
MSG='Doctests indexes' ; echo $MSG
166-
pytest -q --doctest-modules pandas/core/indexes/
167-
RET=$(($RET + $?)) ; echo $MSG "DONE"
168-
169-
MSG='Doctests ops' ; echo $MSG
170-
pytest -q --doctest-modules pandas/core/ops/
171-
RET=$(($RET + $?)) ; echo $MSG "DONE"
172-
173-
MSG='Doctests reshape' ; echo $MSG
174-
pytest -q --doctest-modules pandas/core/reshape/
175-
RET=$(($RET + $?)) ; echo $MSG "DONE"
176-
177-
MSG='Doctests tools' ; echo $MSG
178-
pytest -q --doctest-modules pandas/core/tools/
179-
RET=$(($RET + $?)) ; echo $MSG "DONE"
180-
181-
MSG='Doctests window' ; echo $MSG
182-
pytest -q --doctest-modules pandas/core/window/
183-
RET=$(($RET + $?)) ; echo $MSG "DONE"
184-
185-
MSG='Doctests tseries' ; echo $MSG
186-
pytest -q --doctest-modules pandas/tseries/
109+
MSG='Doctests for individual files' ; echo $MSG
110+
pytest -q --doctest-modules \
111+
pandas/core/accessor.py \
112+
pandas/core/aggregation.py \
113+
pandas/core/algorithms.py \
114+
pandas/core/base.py \
115+
pandas/core/construction.py \
116+
pandas/core/frame.py \
117+
pandas/core/generic.py \
118+
pandas/core/indexers.py \
119+
pandas/core/nanops.py \
120+
pandas/core/series.py \
121+
pandas/io/sql.py
122+
RET=$(($RET + $?)) ; echo $MSG "DONE"
123+
124+
MSG='Doctests for directories' ; echo $MSG
125+
pytest -q --doctest-modules \
126+
pandas/core/arrays/ \
127+
pandas/core/computation/ \
128+
pandas/core/dtypes/ \
129+
pandas/core/groupby/ \
130+
pandas/core/indexes/ \
131+
pandas/core/ops/ \
132+
pandas/core/reshape/ \
133+
pandas/core/strings/ \
134+
pandas/core/tools/ \
135+
pandas/core/window/ \
136+
pandas/tseries/
187137
RET=$(($RET + $?)) ; echo $MSG "DONE"
188138

189139
fi

doc/source/whatsnew/v1.2.5.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Regression in :func:`concat` between two :class:`DataFrame` objects where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
18-
-
18+
- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF8 encoding (:issue:`40986`)
1919
-
2020

2121
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.3.0.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,7 @@ Deprecations
609609
- Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`)
610610
- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
611611
- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`)
612-
- The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories` is deprecated and will be removed in a future version (:issue:`37643`)
612+
- The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories` is deprecated and will be removed in a future version (:issue:`37643`)
613613
- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`)
614614

615615
.. ---------------------------------------------------------------------------
@@ -705,8 +705,9 @@ Conversion
705705
- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
706706
- Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
707707
- Bug in :meth:`StringArray.astype` falling back to numpy and raising when converting to ``dtype='categorical'`` (:issue:`40450`)
708+
- Bug in :func:`factorize` where, when given an array with a numeric numpy dtype lower than int64, uint64 and float64, the unique values did not keep their original dtype (:issue:`41132`)
708709
- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
709-
-
710+
- Bug in :meth:`qcut` raising an error when taking ``Float64Dtype`` as input (:issue:`40730`)
710711

711712
Strings
712713
^^^^^^^
@@ -847,6 +848,8 @@ Groupby/resample/rolling
847848
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementation bounds (:issue:`40767`)
848849
- Bug in :meth:`.GroupBy.rank` with nullable dtypes incorrectly raising ``TypeError`` (:issue:`41010`)
849850
- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
851+
- Bug in :meth:`DataFrame.rolling` returning a mean of zero for an all-``NaN`` window with ``min_periods=0`` if the calculation is not numerically stable (:issue:`41053`)
852+
- Bug in :meth:`DataFrame.rolling` returning a non-zero sum for an all-``NaN`` window with ``min_periods=0`` if the calculation is not numerically stable (:issue:`41053`)
850853

851854
Reshaping
852855
^^^^^^^^^

pandas/_libs/src/ujson/python/objToJSON.c

Lines changed: 29 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@ typedef struct __PdBlockContext {
8383
int ncols;
8484
int transpose;
8585

86-
int *cindices; // frame column -> block column map
8786
NpyArrContext **npyCtxts; // NpyArrContext for each column
8887
} PdBlockContext;
8988

@@ -294,7 +293,12 @@ static int is_simple_frame(PyObject *obj) {
294293
if (!mgr) {
295294
return 0;
296295
}
297-
int ret = (get_attr_length(mgr, "blocks") <= 1);
296+
int ret;
297+
if (PyObject_HasAttrString(mgr, "blocks")) {
298+
ret = (get_attr_length(mgr, "blocks") <= 1);
299+
} else {
300+
ret = 0;
301+
}
298302

299303
Py_DECREF(mgr);
300304
return ret;
@@ -656,16 +660,10 @@ void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
656660
}
657661

658662
void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
659-
PyObject *obj, *blocks, *block, *values, *tmp;
660-
PyArrayObject *locs;
663+
PyObject *obj, *values, *arrays, *array;
661664
PdBlockContext *blkCtxt;
662665
NpyArrContext *npyarr;
663666
Py_ssize_t i;
664-
NpyIter *iter;
665-
NpyIter_IterNextFunc *iternext;
666-
npy_int64 **dataptr;
667-
npy_int64 colIdx;
668-
npy_intp idx;
669667

670668
obj = (PyObject *)_obj;
671669

@@ -687,7 +685,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
687685

688686
if (blkCtxt->ncols == 0) {
689687
blkCtxt->npyCtxts = NULL;
690-
blkCtxt->cindices = NULL;
691688

692689
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
693690
return;
@@ -701,104 +698,45 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
701698
return;
702699
}
703700

704-
blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols);
705-
if (!blkCtxt->cindices) {
706-
PyErr_NoMemory();
707-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
708-
return;
709-
}
710-
711-
blocks = get_sub_attr(obj, "_mgr", "blocks");
712-
if (!blocks) {
701+
arrays = get_sub_attr(obj, "_mgr", "column_arrays");
702+
if (!arrays) {
713703
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
714704
return;
715-
} else if (!PyTuple_Check(blocks)) {
716-
PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!");
717-
goto BLKRET;
718705
}
719706

720-
// force transpose so each NpyArrContext strides down its column
721-
GET_TC(tc)->transpose = 1;
722-
723-
for (i = 0; i < PyObject_Length(blocks); i++) {
724-
block = PyTuple_GET_ITEM(blocks, i);
725-
if (!block) {
707+
for (i = 0; i < PyObject_Length(arrays); i++) {
708+
array = PyList_GET_ITEM(arrays, i);
709+
if (!array) {
726710
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
727-
goto BLKRET;
711+
goto ARR_RET;
728712
}
729713

730-
tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL);
731-
if (!tmp) {
714+
// ensure we have a numpy array (i.e. np.asarray)
715+
values = PyObject_CallMethod(array, "__array__", NULL);
716+
if ((!values) || (!PyArray_CheckExact(values))) {
717+
// Didn't get a numpy array
732718
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
733719
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
734-
goto BLKRET;
735-
}
736-
737-
values = PyArray_Transpose((PyArrayObject *)tmp, NULL);
738-
Py_DECREF(tmp);
739-
if (!values) {
740-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
741-
goto BLKRET;
742-
}
743-
744-
locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array");
745-
if (!locs) {
746-
Py_DECREF(values);
747-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
748-
goto BLKRET;
720+
goto ARR_RET;
749721
}
750722

751-
iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER,
752-
NPY_NO_CASTING, NULL);
753-
if (!iter) {
754-
Py_DECREF(values);
755-
Py_DECREF(locs);
756-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
757-
goto BLKRET;
758-
}
759-
iternext = NpyIter_GetIterNext(iter, NULL);
760-
if (!iternext) {
761-
NpyIter_Deallocate(iter);
762-
Py_DECREF(values);
763-
Py_DECREF(locs);
764-
GET_TC(tc)->iterNext = NpyArr_iterNextNone;
765-
goto BLKRET;
766-
}
767-
dataptr = (npy_int64 **)NpyIter_GetDataPtrArray(iter);
768-
do {
769-
colIdx = **dataptr;
770-
idx = NpyIter_GetIterIndex(iter);
723+
GET_TC(tc)->newObj = values;
771724

772-
blkCtxt->cindices[colIdx] = idx;
725+
// init a dedicated context for this column
726+
NpyArr_iterBegin(obj, tc);
727+
npyarr = GET_TC(tc)->npyarr;
773728

774-
// Reference freed in Pdblock_iterend
775-
Py_INCREF(values);
776-
GET_TC(tc)->newObj = values;
777-
778-
// init a dedicated context for this column
779-
NpyArr_iterBegin(obj, tc);
780-
npyarr = GET_TC(tc)->npyarr;
781-
782-
// set the dataptr to our desired column and initialise
783-
if (npyarr != NULL) {
784-
npyarr->dataptr += npyarr->stride * idx;
785-
NpyArr_iterNext(obj, tc);
786-
}
787-
GET_TC(tc)->itemValue = NULL;
788-
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
789-
790-
blkCtxt->npyCtxts[colIdx] = npyarr;
791-
GET_TC(tc)->newObj = NULL;
792-
} while (iternext(iter));
729+
GET_TC(tc)->itemValue = NULL;
730+
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
793731

794-
NpyIter_Deallocate(iter);
795-
Py_DECREF(values);
796-
Py_DECREF(locs);
732+
blkCtxt->npyCtxts[i] = npyarr;
733+
GET_TC(tc)->newObj = NULL;
797734
}
798735
GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
736+
goto ARR_RET;
799737

800-
BLKRET:
801-
Py_DECREF(blocks);
738+
ARR_RET:
739+
Py_DECREF(arrays);
802740
}
803741

804742
void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
@@ -830,9 +768,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
830768
if (blkCtxt->npyCtxts) {
831769
PyObject_Free(blkCtxt->npyCtxts);
832770
}
833-
if (blkCtxt->cindices) {
834-
PyObject_Free(blkCtxt->cindices);
835-
}
836771
PyObject_Free(blkCtxt);
837772
}
838773
}

0 commit comments

Comments
 (0)