Skip to content

Commit 4ea2e80

Browse files
Merge remote-tracking branch 'upstream/master' into groupby-select-list-fix
2 parents 991e54c + 9871bdd commit 4ea2e80

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+1108
-698
lines changed

asv_bench/benchmarks/io/json.py

+24
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,30 @@ def peakmem_to_json_wide(self, orient, frame):
132132
df.to_json(self.fname, orient=orient)
133133

134134

135+
class ToJSONISO(BaseIO):
136+
fname = "__test__.json"
137+
params = [["split", "columns", "index", "values", "records"]]
138+
param_names = ["orient"]
139+
140+
def setup(self, orient):
141+
N = 10 ** 5
142+
index = date_range("20000101", periods=N, freq="H")
143+
timedeltas = timedelta_range(start=1, periods=N, freq="s")
144+
datetimes = date_range(start=1, periods=N, freq="s")
145+
self.df = DataFrame(
146+
{
147+
"td_1": timedeltas,
148+
"td_2": timedeltas,
149+
"ts_1": datetimes,
150+
"ts_2": datetimes,
151+
},
152+
index=index,
153+
)
154+
155+
def time_iso_format(self, orient):
156+
self.df.to_json(orient=orient, date_format="iso")
157+
158+
135159
class ToJSONLines(BaseIO):
136160

137161
fname = "__test__.json"

ci/azure/posix.yml

+21-7
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,24 @@ jobs:
1919
ENV_FILE: ci/deps/azure-36-minimum_versions.yaml
2020
CONDA_PY: "36"
2121
PATTERN: "not slow and not network"
22+
2223
py36_locale_slow_old_np:
2324
ENV_FILE: ci/deps/azure-36-locale_slow.yaml
2425
CONDA_PY: "36"
2526
PATTERN: "slow"
26-
LOCALE_OVERRIDE: "zh_CN.UTF-8"
27+
# pandas does not use the language (zh_CN), but should support different encodings (utf8)
28+
# we should test with encodings other than utf8, but it doesn't seem like Ubuntu supports any
29+
LANG: "zh_CN.utf8"
30+
LC_ALL: "zh_CN.utf8"
2731
EXTRA_APT: "language-pack-zh-hans"
2832

2933
py36_locale:
3034
ENV_FILE: ci/deps/azure-36-locale.yaml
3135
CONDA_PY: "36"
3236
PATTERN: "not slow and not network"
33-
LOCALE_OVERRIDE: "it_IT.UTF-8"
37+
LANG: "it_IT.utf8"
38+
LC_ALL: "it_IT.utf8"
39+
EXTRA_APT: "language-pack-it"
3440

3541
py36_32bit:
3642
ENV_FILE: ci/deps/azure-36-32bit.yaml
@@ -42,7 +48,9 @@ jobs:
4248
ENV_FILE: ci/deps/azure-37-locale.yaml
4349
CONDA_PY: "37"
4450
PATTERN: "not slow and not network"
45-
LOCALE_OVERRIDE: "zh_CN.UTF-8"
51+
LANG: "zh_CN.utf8"
52+
LC_ALL: "zh_CN.utf8"
53+
EXTRA_APT: "language-pack-zh-hans"
4654

4755
py37_np_dev:
4856
ENV_FILE: ci/deps/azure-37-numpydev.yaml
@@ -54,10 +62,16 @@ jobs:
5462

5563
steps:
5664
- script: |
57-
if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi
58-
echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
59-
echo "Creating Environment"
60-
ci/setup_env.sh
65+
if [ "$(uname)" == "Linux" ]; then
66+
sudo apt-get update
67+
sudo apt-get install -y libc6-dev-i386 $EXTRA_APT
68+
fi
69+
displayName: 'Install extra packages'
70+
71+
- script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin'
72+
displayName: 'Set conda path'
73+
74+
- script: ci/setup_env.sh
6175
displayName: 'Setup environment and build pandas'
6276

6377
- script: |

ci/code_checks.sh

+8
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,14 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
100100
cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/_libs/*.cpp
101101
RET=$(($RET + $?)) ; echo $MSG "DONE"
102102

103+
MSG='Check for use of not concatenated strings' ; echo $MSG
104+
if [[ "$GITHUB_ACTIONS" == "true" ]]; then
105+
$BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:{msg}" .
106+
else
107+
$BASE_DIR/scripts/validate_string_concatenation.py .
108+
fi
109+
RET=$(($RET + $?)) ; echo $MSG "DONE"
110+
103111
echo "isort --version-number"
104112
isort --version-number
105113

ci/run_tests.sh

-11
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,6 @@
55
# https://github.com/pytest-dev/pytest/issues/1075
66
export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))')
77

8-
if [ -n "$LOCALE_OVERRIDE" ]; then
9-
export LC_ALL="$LOCALE_OVERRIDE"
10-
export LANG="$LOCALE_OVERRIDE"
11-
PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'`
12-
if [[ "$LOCALE_OVERRIDE" != "$PANDAS_LOCALE" ]]; then
13-
echo "pandas could not detect the locale. System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE"
14-
# TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed
15-
# exit 1
16-
fi
17-
fi
18-
198
if [[ "not network" == *"$PATTERN"* ]]; then
209
export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4;
2110
fi

ci/setup_env.sh

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
#!/bin/bash -e
22

33
# edit the locale file if needed
4-
if [ -n "$LOCALE_OVERRIDE" ]; then
4+
if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then
55
echo "Adding locale to the first line of pandas/__init__.py"
66
rm -f pandas/__init__.pyc
7-
SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n"
7+
SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n"
88
sed -i "$SEDC" pandas/__init__.py
9+
910
echo "[head -4 pandas/__init__.py]"
1011
head -4 pandas/__init__.py
1112
echo
12-
sudo locale-gen "$LOCALE_OVERRIDE"
1313
fi
1414

1515
MINICONDA_DIR="$HOME/miniconda3"

doc/source/whatsnew/v1.0.0.rst

+10-3
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ Dedicated string data type
5656
^^^^^^^^^^^^^^^^^^^^^^^^^^
5757

5858
We've added :class:`StringDtype`, an extension type dedicated to string data.
59-
Previously, strings were typically stored in object-dtype NumPy arrays.
59+
Previously, strings were typically stored in object-dtype NumPy arrays. (:issue:`29975`)
6060

6161
.. warning::
6262

@@ -216,13 +216,15 @@ Other enhancements
216216
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
217217
now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
218218
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
219+
- :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`)
219220
- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
220221
- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`)
221222
- DataFrame constructor preserves `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`)
222223
- :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`)
223224
- :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` added (:issue:`11052`)
224225
- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`)
225226
- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`)
227+
- :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`)
226228

227229
Build Changes
228230
^^^^^^^^^^^^^
@@ -812,6 +814,7 @@ Datetimelike
812814
- Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`)
813815
- Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`)
814816
- Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`)
817+
- Bug in :meth:`Series.astype`, :meth:`Index.astype`, and :meth:`DataFrame.astype` failing to handle ``NaT`` when casting to an integer dtype (:issue:`28492`)
815818
- Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`)
816819
- Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype `'timedelta64[ns]'` (:issue:`28049`)
817820
- Bug in :func:`pandas.core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`)
@@ -924,6 +927,7 @@ I/O
924927
- Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`)
925928
- Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`)
926929
- :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`)
930+
- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`)
927931

928932
Plotting
929933
^^^^^^^^
@@ -945,7 +949,7 @@ Plotting
945949
Groupby/resample/rolling
946950
^^^^^^^^^^^^^^^^^^^^^^^^
947951

948-
-
952+
- Bug in :meth:`DataFrame.groupby.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`)
949953
- Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`)
950954
- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty series or dataframe (:issue:`28427`)
951955
- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`)
@@ -975,6 +979,7 @@ Reshaping
975979
- :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`)
976980
- Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`).
977981
- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`)
982+
- Bug in :meth:`Index.join` that caused infinite recursion error for mismatched ``MultiIndex`` name orders. (:issue:`25760`, :issue:`28956`)
978983
- Bug in :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`)
979984
- Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`)
980985
- Bug in :meth:`DataFrame.replace` that caused a non-numeric replacer's dtype to not be respected (:issue:`26632`)
@@ -1013,7 +1018,9 @@ Other
10131018
- Fixed ``pow`` operations for :class:`IntegerArray` when the other value is ``0`` or ``1`` (:issue:`29997`)
10141019
- Bug in :meth:`Series.count` raising an error if ``use_inf_as_na`` is enabled (:issue:`29478`)
10151020
- Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:`29069`)
1016-
1021+
- Bug in :class:`DataFrame` constructor when passing a 2D ``ndarray`` and an extension dtype (:issue:`12513`)
1022+
- Bug in :meth:`DataFrame.to_csv` when supplied a series with a ``dtype="string"`` and a ``na_rep``, the ``na_rep`` was being truncated to 2 characters. (:issue:`29975`)
1023+
- Bug where :meth:`DataFrame.itertuples` would incorrectly determine whether or not namedtuples could be used for dataframes of 255 columns (:issue:`28282`)
10171024

10181025
.. _whatsnew_1000.contributors:
10191026

pandas/_libs/intervaltree.pxi.in

+1-4
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,11 @@ from pandas._libs.algos import is_monotonic
88

99
ctypedef fused int_scalar_t:
1010
int64_t
11-
int32_t
1211
float64_t
13-
float32_t
1412

1513
ctypedef fused uint_scalar_t:
1614
uint64_t
1715
float64_t
18-
float32_t
1916

2017
ctypedef fused scalar_t:
2118
int_scalar_t
@@ -212,7 +209,7 @@ cdef sort_values_and_indices(all_values, all_indices, subset):
212209
{{py:
213210

214211
nodes = []
215-
for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']:
212+
for dtype in ['float64', 'int64', 'uint64']:
216213
for closed, cmp_left, cmp_right in [
217214
('left', '<=', '<'),
218215
('right', '<', '<='),

pandas/_libs/parsers.pyx

+9-27
Original file line numberDiff line numberDiff line change
@@ -171,12 +171,9 @@ cdef extern from "parser/tokenizer.h":
171171
int64_t skip_first_N_rows
172172
int64_t skipfooter
173173
# pick one, depending on whether the converter requires GIL
174-
float64_t (*double_converter_nogil)(const char *, char **,
175-
char, char, char,
176-
int, int *, int *) nogil
177-
float64_t (*double_converter_withgil)(const char *, char **,
178-
char, char, char,
179-
int, int *, int *)
174+
float64_t (*double_converter)(const char *, char **,
175+
char, char, char,
176+
int, int *, int *) nogil
180177

181178
# error handling
182179
char *warn_msg
@@ -469,16 +466,11 @@ cdef class TextReader:
469466

470467
if float_precision == "round_trip":
471468
# see gh-15140
472-
#
473-
# Our current roundtrip implementation requires the GIL.
474-
self.parser.double_converter_nogil = NULL
475-
self.parser.double_converter_withgil = round_trip
469+
self.parser.double_converter = round_trip
476470
elif float_precision == "high":
477-
self.parser.double_converter_withgil = NULL
478-
self.parser.double_converter_nogil = precise_xstrtod
471+
self.parser.double_converter = precise_xstrtod
479472
else:
480-
self.parser.double_converter_withgil = NULL
481-
self.parser.double_converter_nogil = xstrtod
473+
self.parser.double_converter = xstrtod
482474

483475
if isinstance(dtype, dict):
484476
dtype = {k: pandas_dtype(dtype[k])
@@ -1663,22 +1655,12 @@ cdef _try_double(parser_t *parser, int64_t col,
16631655
result = np.empty(lines, dtype=np.float64)
16641656
data = <float64_t *>result.data
16651657
na_fset = kset_float64_from_list(na_flist)
1666-
if parser.double_converter_nogil != NULL: # if it can run without the GIL
1667-
with nogil:
1668-
error = _try_double_nogil(parser, parser.double_converter_nogil,
1669-
col, line_start, line_end,
1670-
na_filter, na_hashset, use_na_flist,
1671-
na_fset, NA, data, &na_count)
1672-
else:
1673-
assert parser.double_converter_withgil != NULL
1674-
error = _try_double_nogil(parser,
1675-
<float64_t (*)(const char *, char **,
1676-
char, char, char,
1677-
int, int *, int *)
1678-
nogil>parser.double_converter_withgil,
1658+
with nogil:
1659+
error = _try_double_nogil(parser, parser.double_converter,
16791660
col, line_start, line_end,
16801661
na_filter, na_hashset, use_na_flist,
16811662
na_fset, NA, data, &na_count)
1663+
16821664
kh_destroy_float64(na_fset)
16831665
if error != 0:
16841666
return None, None

pandas/_libs/reduction.pyx

+10-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from copy import copy
12
from distutils.version import LooseVersion
23

34
from cython import Py_ssize_t
@@ -15,7 +16,7 @@ from numpy cimport (ndarray,
1516
cnp.import_array()
1617

1718
cimport pandas._libs.util as util
18-
from pandas._libs.lib import maybe_convert_objects
19+
from pandas._libs.lib import maybe_convert_objects, is_scalar
1920

2021

2122
cdef _check_result_array(object obj, Py_ssize_t cnt):
@@ -492,14 +493,19 @@ def apply_frame_axis0(object frame, object f, object names,
492493
# Need to infer if low level index slider will cause segfaults
493494
require_slow_apply = i == 0 and piece is chunk
494495
try:
495-
if piece.index is chunk.index:
496-
piece = piece.copy(deep='all')
497-
else:
496+
if piece.index is not chunk.index:
498497
mutated = True
499498
except AttributeError:
500499
# `piece` might not have an index, could be e.g. an int
501500
pass
502501

502+
if not is_scalar(piece):
503+
# Need to copy data to avoid appending references
504+
if hasattr(piece, "copy"):
505+
piece = piece.copy(deep="all")
506+
else:
507+
piece = copy(piece)
508+
503509
results.append(piece)
504510

505511
# If the data was modified inplace we need to

pandas/_libs/src/parser/tokenizer.c

+7
Original file line numberDiff line numberDiff line change
@@ -1774,11 +1774,18 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17741774

17751775
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17761776
int skip_trailing, int *error, int *maybe_int) {
1777+
// This is called from a nogil block in parsers.pyx
1778+
// so need to explicitly get GIL before Python calls
1779+
PyGILState_STATE gstate;
1780+
gstate = PyGILState_Ensure();
1781+
17771782
double r = PyOS_string_to_double(p, q, 0);
17781783
if (maybe_int != NULL) *maybe_int = 0;
17791784
if (PyErr_Occurred() != NULL) *error = -1;
17801785
else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
17811786
PyErr_Clear();
1787+
1788+
PyGILState_Release(gstate);
17821789
return r;
17831790
}
17841791

pandas/_libs/src/parser/tokenizer.h

+4-5
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,8 @@ typedef struct parser_t {
155155
PyObject *skipfunc;
156156
int64_t skip_first_N_rows;
157157
int64_t skip_footer;
158-
// pick one, depending on whether the converter requires GIL
159-
double (*double_converter_nogil)(const char *, char **,
160-
char, char, char, int, int *, int *);
161-
double (*double_converter_withgil)(const char *, char **,
162-
char, char, char, int, int *, int *);
158+
double (*double_converter)(const char *, char **,
159+
char, char, char, int, int *, int *);
163160

164161
// error handling
165162
char *warn_msg;
@@ -226,6 +223,8 @@ double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
226223
double precise_xstrtod(const char *p, char **q, char decimal,
227224
char sci, char tsep, int skip_trailing,
228225
int *error, int *maybe_int);
226+
227+
// GH-15140 - round_trip requires and acquires the GIL on its own
229228
double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
230229
int skip_trailing, int *error, int *maybe_int);
231230
int to_boolean(const char *item, uint8_t *val);

pandas/_libs/src/ujson/lib/ultrajson.h

+4
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,8 @@ enum JSTYPES {
154154
JT_ARRAY, // Array structure
155155
JT_OBJECT, // Key/Value structure
156156
JT_INVALID, // Internal, do not return nor expect
157+
JT_POS_INF, // Positive infinity
158+
JT_NEG_INF, // Negative infinity
157159
};
158160

159161
typedef void * JSOBJ;
@@ -290,6 +292,8 @@ typedef struct __JSONObjectDecoder {
290292
JSOBJ (*newTrue)(void *prv);
291293
JSOBJ (*newFalse)(void *prv);
292294
JSOBJ (*newNull)(void *prv);
295+
JSOBJ (*newPosInf)(void *prv);
296+
JSOBJ (*newNegInf)(void *prv);
293297
JSOBJ (*newObject)(void *prv, void *decoder);
294298
JSOBJ (*endObject)(void *prv, JSOBJ obj);
295299
JSOBJ (*newArray)(void *prv, void *decoder);

0 commit comments

Comments
 (0)