diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d6744f578560c..ca0c75f9de94f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -65,7 +65,7 @@ jobs:
if: always()
- name: Testing docstring validation script
- run: pytest --capture=no --strict-markers scripts
+ run: pytest scripts
if: always()
- name: Running benchmarks
diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml
index ba5a0a1fd0909..a5aef7825c770 100644
--- a/.github/workflows/database.yml
+++ b/.github/workflows/database.yml
@@ -78,7 +78,7 @@ jobs:
uses: ./.github/actions/build_pandas
- name: Test
- run: pytest -m "${{ env.PATTERN }}" -n 2 --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml -s --cov=pandas --cov-report=xml pandas/tests/io
+ run: pytest -m "${{ env.PATTERN }}" -n 2 --dist=loadfile --cov=pandas --cov-report=xml pandas/tests/io
if: always()
- name: Build Version
diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml
new file mode 100644
index 0000000000000..2643dc5ec656e
--- /dev/null
+++ b/.github/workflows/python-dev.yml
@@ -0,0 +1,70 @@
+name: Python Dev
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ name: actions-310-dev
+ timeout-minutes: 60
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Set up Python Dev Version
+ uses: actions/setup-python@v2
+ with:
+ python-version: '3.10-dev'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip setuptools wheel
+ pip install git+https://github.com/numpy/numpy.git
+ pip install git+https://github.com/pytest-dev/pytest.git
+ pip install git+https://github.com/nedbat/coveragepy.git
+ pip install cython python-dateutil pytz hypothesis pytest-xdist
+ pip list
+
+ - name: Build Pandas
+ run: |
+ python setup.py build_ext -q -j2
+ python -m pip install -e . --no-build-isolation --no-use-pep517
+
+ - name: Build Version
+ run: |
+ python -c "import pandas; pandas.show_versions();"
+
+ - name: Test with pytest
+ run: |
+ coverage run -m pytest -m 'not slow and not network and not clipboard' --junitxml=test-data.xml pandas
+ continue-on-error: true
+
+ - name: Publish test results
+ uses: actions/upload-artifact@master
+ with:
+ name: Test results
+ path: test-data.xml
+ if: failure()
+
+ - name: Print skipped tests
+ run: |
+ python ci/print_skipped.py
+
+ - name: Report Coverage
+ run: |
+ coverage report -m
+
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v1
+ with:
+ flags: unittests
+ name: codecov-pandas
+ fail_ci_if_error: true
diff --git a/.gitignore b/.gitignore
index b682d93efbd04..2c337be60e94e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,6 +104,7 @@ asv_bench/env/
asv_bench/html/
asv_bench/results/
asv_bench/pandas/
+test-data.xml
# Documentation generated files #
#################################
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2f46190ef5eb7..5b11490479088 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -35,7 +35,7 @@ repos:
exclude: ^pandas/_libs/src/(klib|headers)/
args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir']
- repo: https://gitlab.com/pycqa/flake8
- rev: 3.9.1
+ rev: 3.9.0
hooks:
- id: flake8
additional_dependencies:
@@ -75,7 +75,7 @@ repos:
hooks:
- id: yesqa
additional_dependencies:
- - flake8==3.9.1
+ - flake8==3.9.0
- flake8-comprehensions==3.1.0
- flake8-bugbear==21.3.2
- pandas-dev-flaker==0.2.0
diff --git a/LICENSE b/LICENSE
index 76954a5a339ab..a0cc369f725b8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -3,7 +3,7 @@ BSD 3-Clause License
Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.
-Copyright (c) 2011-2020, Open source contributors.
+Copyright (c) 2011-2021, Open source contributors.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 65167e6467fd5..760da36a30075 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -563,6 +563,14 @@ def time_frame_nunique(self):
self.df.nunique()
+class SeriesNuniqueWithNan:
+ def setup(self):
+ self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
+
+ def time_series_nunique_nan(self):
+ self.ser.nunique()
+
+
class Duplicated:
def setup(self):
n = 1 << 20
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index b4b20553ec460..27761ccd0d917 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -505,6 +505,34 @@ def time_frame_agg(self, dtype, method):
self.df.groupby("key").agg(method)
+class CumminMax:
+ param_names = ["dtype", "method"]
+ params = [
+ ["float64", "int64", "Float64", "Int64"],
+ ["cummin", "cummax"],
+ ]
+
+ def setup(self, dtype, method):
+ N = 500_000
+ vals = np.random.randint(-10, 10, (N, 5))
+ null_vals = vals.astype(float, copy=True)
+ null_vals[::2, :] = np.nan
+ null_vals[::3, :] = np.nan
+ df = DataFrame(vals, columns=list("abcde"), dtype=dtype)
+ null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)
+ keys = np.random.randint(0, 100, size=N)
+ df["key"] = keys
+ null_df["key"] = keys
+ self.df = df
+ self.null_df = null_df
+
+ def time_frame_transform(self, dtype, method):
+ self.df.groupby("key").transform(method)
+
+ def time_frame_transform_many_nulls(self, dtype, method):
+ self.null_df.groupby("key").transform(method)
+
+
class RankWithTies:
# GH 21237
param_names = ["dtype", "tie_method"]
diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py
index 6c0ca6fac6ec3..e4369d67ca67e 100644
--- a/asv_bench/benchmarks/io/style.py
+++ b/asv_bench/benchmarks/io/style.py
@@ -17,19 +17,19 @@ def setup(self, cols, rows):
def time_apply_render(self, cols, rows):
self._style_apply()
- self.st.render()
+ self.st._render_html()
def peakmem_apply_render(self, cols, rows):
self._style_apply()
- self.st.render()
+ self.st._render_html()
def time_classes_render(self, cols, rows):
self._style_classes()
- self.st.render()
+ self.st._render_html()
def peakmem_classes_render(self, cols, rows):
self._style_classes()
- self.st.render()
+ self.st._render_html()
def _style_apply(self):
def _apply_func(s):
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 76257e1b40f1a..45a9053954569 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -50,91 +50,126 @@ def peakmem_cat_frame_construction(self, dtype):
class Methods:
- def setup(self):
- self.s = Series(tm.makeStringIndex(10 ** 5))
+ params = ["str", "string", "arrow_string"]
+ param_names = ["dtype"]
+
+ def setup(self, dtype):
+ from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
- def time_center(self):
+ try:
+ self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
+ except ImportError:
+ raise NotImplementedError
+
+ def time_center(self, dtype):
self.s.str.center(100)
- def time_count(self):
+ def time_count(self, dtype):
self.s.str.count("A")
- def time_endswith(self):
+ def time_endswith(self, dtype):
self.s.str.endswith("A")
- def time_extract(self):
+ def time_extract(self, dtype):
with warnings.catch_warnings(record=True):
self.s.str.extract("(\\w*)A(\\w*)")
- def time_findall(self):
+ def time_findall(self, dtype):
self.s.str.findall("[A-Z]+")
- def time_find(self):
+ def time_find(self, dtype):
self.s.str.find("[A-Z]+")
- def time_rfind(self):
+ def time_rfind(self, dtype):
self.s.str.rfind("[A-Z]+")
- def time_get(self):
+ def time_get(self, dtype):
self.s.str.get(0)
- def time_len(self):
+ def time_len(self, dtype):
self.s.str.len()
- def time_join(self):
+ def time_join(self, dtype):
self.s.str.join(" ")
- def time_match(self):
+ def time_match(self, dtype):
self.s.str.match("A")
- def time_normalize(self):
+ def time_normalize(self, dtype):
self.s.str.normalize("NFC")
- def time_pad(self):
+ def time_pad(self, dtype):
self.s.str.pad(100, side="both")
- def time_partition(self):
+ def time_partition(self, dtype):
self.s.str.partition("A")
- def time_rpartition(self):
+ def time_rpartition(self, dtype):
self.s.str.rpartition("A")
- def time_replace(self):
+ def time_replace(self, dtype):
self.s.str.replace("A", "\x01\x01")
- def time_translate(self):
+ def time_translate(self, dtype):
self.s.str.translate({"A": "\x01\x01"})
- def time_slice(self):
+ def time_slice(self, dtype):
self.s.str.slice(5, 15, 2)
- def time_startswith(self):
+ def time_startswith(self, dtype):
self.s.str.startswith("A")
- def time_strip(self):
+ def time_strip(self, dtype):
self.s.str.strip("A")
- def time_rstrip(self):
+ def time_rstrip(self, dtype):
self.s.str.rstrip("A")
- def time_lstrip(self):
+ def time_lstrip(self, dtype):
self.s.str.lstrip("A")
- def time_title(self):
+ def time_title(self, dtype):
self.s.str.title()
- def time_upper(self):
+ def time_upper(self, dtype):
self.s.str.upper()
- def time_lower(self):
+ def time_lower(self, dtype):
self.s.str.lower()
- def time_wrap(self):
+ def time_wrap(self, dtype):
self.s.str.wrap(10)
- def time_zfill(self):
+ def time_zfill(self, dtype):
self.s.str.zfill(10)
+ def time_isalnum(self, dtype):
+ self.s.str.isalnum()
+
+ def time_isalpha(self, dtype):
+ self.s.str.isalpha()
+
+ def time_isdecimal(self, dtype):
+ self.s.str.isdecimal()
+
+ def time_isdigit(self, dtype):
+ self.s.str.isdigit()
+
+ def time_islower(self, dtype):
+ self.s.str.islower()
+
+ def time_isnumeric(self, dtype):
+ self.s.str.isnumeric()
+
+ def time_isspace(self, dtype):
+ self.s.str.isspace()
+
+ def time_istitle(self, dtype):
+ self.s.str.istitle()
+
+ def time_isupper(self, dtype):
+ self.s.str.isupper()
+
class Repeat:
@@ -178,13 +213,18 @@ def time_cat(self, other_cols, sep, na_rep, na_frac):
class Contains:
- params = [True, False]
- param_names = ["regex"]
+ params = (["str", "string", "arrow_string"], [True, False])
+ param_names = ["dtype", "regex"]
+
+ def setup(self, dtype, regex):
+ from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
- def setup(self, regex):
- self.s = Series(tm.makeStringIndex(10 ** 5))
+ try:
+ self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
+ except ImportError:
+ raise NotImplementedError
- def time_contains(self, regex):
+ def time_contains(self, dtype, regex):
self.s.str.contains("A", regex=regex)
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index d4b6c0d6ff09d..149e10b48933d 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -106,84 +106,43 @@ fi
### DOCTESTS ###
if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
- # Individual files
-
- MSG='Doctests accessor.py' ; echo $MSG
- pytest -q --doctest-modules pandas/core/accessor.py
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests aggregation.py' ; echo $MSG
- pytest -q --doctest-modules pandas/core/aggregation.py
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests base.py' ; echo $MSG
- pytest -q --doctest-modules pandas/core/base.py
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests construction.py' ; echo $MSG
- pytest -q --doctest-modules pandas/core/construction.py
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests frame.py' ; echo $MSG
- pytest -q --doctest-modules pandas/core/frame.py
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests generic.py' ; echo $MSG
- pytest -q --doctest-modules pandas/core/generic.py
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests series.py' ; echo $MSG
- pytest -q --doctest-modules pandas/core/series.py
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests strings.py' ; echo $MSG
- pytest -q --doctest-modules pandas/core/strings/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests sql.py' ; echo $MSG
- pytest -q --doctest-modules pandas/io/sql.py
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- # Directories
-
- MSG='Doctests arrays'; echo $MSG
- pytest -q --doctest-modules pandas/core/arrays/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests computation' ; echo $MSG
- pytest -q --doctest-modules pandas/core/computation/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests dtypes'; echo $MSG
- pytest -q --doctest-modules pandas/core/dtypes/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests groupby' ; echo $MSG
- pytest -q --doctest-modules pandas/core/groupby/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests indexes' ; echo $MSG
- pytest -q --doctest-modules pandas/core/indexes/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests ops' ; echo $MSG
- pytest -q --doctest-modules pandas/core/ops/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests reshape' ; echo $MSG
- pytest -q --doctest-modules pandas/core/reshape/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests tools' ; echo $MSG
- pytest -q --doctest-modules pandas/core/tools/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests window' ; echo $MSG
- pytest -q --doctest-modules pandas/core/window/
- RET=$(($RET + $?)) ; echo $MSG "DONE"
-
- MSG='Doctests tseries' ; echo $MSG
- pytest -q --doctest-modules pandas/tseries/
+ MSG='Doctests for individual files' ; echo $MSG
+ pytest -q --doctest-modules \
+ pandas/core/accessor.py \
+ pandas/core/aggregation.py \
+ pandas/core/algorithms.py \
+ pandas/core/base.py \
+ pandas/core/construction.py \
+ pandas/core/frame.py \
+ pandas/core/generic.py \
+ pandas/core/indexers.py \
+ pandas/core/nanops.py \
+ pandas/core/series.py \
+ pandas/io/sql.py
+ RET=$(($RET + $?)) ; echo $MSG "DONE"
+
+ MSG='Doctests for directories' ; echo $MSG
+ pytest -q --doctest-modules \
+ pandas/_libs/ \
+ pandas/api/ \
+ pandas/arrays/ \
+ pandas/compat/ \
+ pandas/core/array_algos/ \
+ pandas/core/arrays/ \
+ pandas/core/computation/ \
+ pandas/core/dtypes/ \
+ pandas/core/groupby/ \
+ pandas/core/indexes/ \
+ pandas/core/ops/ \
+ pandas/core/reshape/ \
+ pandas/core/strings/ \
+ pandas/core/tools/ \
+ pandas/core/window/ \
+ pandas/errors/ \
+ pandas/io/clipboard/ \
+ pandas/io/parsers/ \
+ pandas/io/sas/ \
+ pandas/tseries/
RET=$(($RET + $?)) ; echo $MSG "DONE"
fi
diff --git a/ci/deps/actions-37-minimum_versions.yaml b/ci/deps/actions-37-minimum_versions.yaml
index 8052156858a32..3237cf9770220 100644
--- a/ci/deps/actions-37-minimum_versions.yaml
+++ b/ci/deps/actions-37-minimum_versions.yaml
@@ -6,7 +6,7 @@ dependencies:
# tools
- cython=0.29.21
- - pytest=5.0.1
+ - pytest>=6.0
- pytest-cov
- pytest-xdist>=1.21
- hypothesis>=3.58.0
diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml
index 61f431256dd4a..f29830e9b3e79 100644
--- a/ci/deps/actions-37.yaml
+++ b/ci/deps/actions-37.yaml
@@ -15,7 +15,7 @@ dependencies:
# pandas dependencies
- botocore>=1.11
- fsspec>=0.7.4
- - numpy
+ - numpy=1.19
- python-dateutil
- nomkl
- pyarrow=0.15.1
diff --git a/ci/run_tests.sh b/ci/run_tests.sh
index f5e3420b8c9b3..261d6364cb5e1 100755
--- a/ci/run_tests.sh
+++ b/ci/run_tests.sh
@@ -19,7 +19,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then
XVFB="xvfb-run "
fi
-PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
+PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas"
if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
# GH#37455 windows py38 build appears to be running out of memory
@@ -30,7 +30,7 @@ fi
echo $PYTEST_CMD
sh -c "$PYTEST_CMD"
-PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
+PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -m \"$PATTERN and arraymanager\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas"
echo $PYTEST_AM_CMD
sh -c "$PYTEST_AM_CMD"
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index b9afbe387799e..f4a09e0daa750 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -110,8 +110,8 @@ version control to allow many people to work together on the project.
Some great resources for learning Git:
* the `GitHub help pages `_.
-* the `NumPy's documentation `_.
-* Matthew Brett's `Pydagogue `_.
+* the `NumPy documentation `_.
+* Matthew Brett's `Pydagogue `_.
Getting started with Git
------------------------
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index 56aa734deddd6..d53d0556dca04 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -164,6 +164,21 @@ A good implementation for Python users is `has2k1/plotnine `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use.
+`Lux `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`Lux `__ is a Python library that facilitates fast and easy experimentation with data by automating the visual data exploration process. To use Lux, simply add an extra import alongside pandas:
+
+.. code:: python
+
+ import lux
+ import pandas as pd
+
+ df = pd.read_csv("data.csv")
+ df # discover interesting insights!
+
+By printing out a dataframe, Lux automatically `recommends a set of visualizations `__ that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a `powerful, intuitive language `__ that allows users to create `Altair `__, `matplotlib `__, or `Vega-Lite `__ visualizations without having to think at the level of code.
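+
+A minimal sketch of that language (the ``intent`` attribute is Lux's steering mechanism; the column name here is hypothetical):
+
+.. code:: python
+
+   df.intent = ["AverageCost"]
+   df  # recommendations now focus on AverageCost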
+
`Qtpandas `__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index b6351ac2232ff..16beb00d201b7 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -195,7 +195,7 @@ pandas is equipped with an exhaustive set of unit tests, covering about 97% of
the code base as of this writing. To run it on your machine to verify that
everything is working (and that you have all of the dependencies, soft and hard,
installed), make sure you have `pytest
-`__ >= 5.0.1 and `Hypothesis
+`__ >= 6.0 and `Hypothesis
`__ >= 3.58, then run:
::
@@ -362,6 +362,21 @@ pyarrow 0.15.0 Parquet, ORC, and feather reading /
pyreadstat SPSS files (.sav) reading
========================= ================== =============================================================
+.. _install.warn_orc:
+
+.. warning::
+
+   * If you want to use :func:`~pandas.read_orc`, it is highly recommended to install pyarrow using conda.
+     The following table summarizes the environments in which :func:`~pandas.read_orc` is known to work.
+
+     ========================= ================== =============================================================
+     System                    Conda              PyPI
+     ========================= ================== =============================================================
+     Linux                     Successful         Failed (succeeds with pyarrow==3.0)
+     macOS                     Successful         Failed
+     Windows                   Failed             Failed
+     ========================= ================== =============================================================
+
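+   For example, a working Linux or macOS setup can be created with conda (a sketch; any channel that ships pyarrow, such as conda-forge, should work)::
+
+      conda install -c conda-forge pyarrow
+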
Access data in the cloud
^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
index 180f833a2753d..1de978b195382 100644
--- a/doc/source/user_guide/gotchas.rst
+++ b/doc/source/user_guide/gotchas.rst
@@ -178,7 +178,7 @@ To test for membership in the values, use the method :meth:`~pandas.Series.isin`
For ``DataFrames``, likewise, ``in`` applies to the column axis,
testing for membership in the list of column names.
-.. _udf-mutation:
+.. _gotchas.udf-mutation:
Mutating with User Defined Function (UDF) methods
-------------------------------------------------
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index afb2e72cbff07..3f596388ca226 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -739,6 +739,26 @@ optimized Cython implementations:
Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above
code would work even without the special versions via dispatching (see below).
+.. _groupby.aggregate.udfs:
+
+Aggregations with User-Defined Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Users can also provide their own functions for custom aggregations. When aggregating
+with a User-Defined Function (UDF), the UDF should not mutate the provided ``Series``; see
+:ref:`gotchas.udf-mutation` for more information.
+
+.. ipython:: python
+
+ animals.groupby("kind")[["height"]].agg(lambda x: set(x))
+
+The resulting dtype will reflect that of the aggregating function. If the results from different groups have
+different dtypes, then a common dtype will be determined in the same way as ``DataFrame`` construction.
+
+.. ipython:: python
+
+ animals.groupby("kind")[["height"]].agg(lambda x: x.astype(int).sum())
+
.. _groupby.transform:
Transformation
@@ -759,7 +779,11 @@ as the one being grouped. The transform function must:
* (Optionally) operates on the entire group chunk. If this is supported, a
fast path is used starting from the *second* chunk.
-For example, suppose we wished to standardize the data within each group:
+Similar to :ref:`groupby.aggregate.udfs`, the resulting dtype will reflect that of the
+transformation function. If the results from different groups have different dtypes, then
+a common dtype will be determined in the same way as ``DataFrame`` construction.
+
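+For example (a minimal illustration reusing the ``animals`` frame from the aggregation examples above), a transformation returning integers produces integer results even though ``height`` is stored as float:
+
+.. ipython:: python
+
+   animals.groupby("kind")[["height"]].transform(lambda x: x.astype(int))
+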
+Suppose we wished to standardize the data within each group:
.. ipython:: python
@@ -1065,13 +1089,16 @@ that is itself a series, and possibly upcast the result to a DataFrame:
s
s.apply(f)
-
.. note::
``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to it.
So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in
the output as well as set the indices.
+Similar to :ref:`groupby.aggregate.udfs`, the resulting dtype will reflect that of the
+apply function. If the results from different groups have different dtypes, then
+a common dtype will be determined in the same way as ``DataFrame`` construction.
+
Numba Accelerated Routines
--------------------------
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 3b7a6037a9715..5148bb87b0eb0 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5443,6 +5443,11 @@ Similar to the :ref:`parquet ` format, the `ORC Format `__ library.
+.. warning::
+
+   * It is *highly recommended* to install pyarrow using conda, due to issues with the pyarrow wheels published on PyPI.
+   * :func:`~pandas.read_orc` is not supported on Windows yet; you can find valid environments in :ref:`install optional dependencies `.
+
.. _io.sql:
SQL queries
diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst
index 1fcaac1a91d09..278eb907102ed 100644
--- a/doc/source/user_guide/options.rst
+++ b/doc/source/user_guide/options.rst
@@ -456,6 +456,10 @@ io.hdf.dropna_table True drop ALL nan rows when appe
io.parquet.engine None The engine to use as a default for
parquet reading and writing. If None
then try 'pyarrow' and 'fastparquet'
+io.sql.engine None The engine to use as a default for
+ sql reading and writing, with SQLAlchemy
+ as a higher level interface. If None
+ then try 'sqlalchemy'
mode.chained_assignment warn Controls ``SettingWithCopyWarning``:
'raise', 'warn', or None. Raise an
exception, warn, or no action if
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
index 765b2929d3014..86696cc909764 100644
--- a/doc/source/user_guide/style.ipynb
+++ b/doc/source/user_guide/style.ipynb
@@ -1006,7 +1006,30 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We expect certain styling functions to be common enough that we've included a few \"built-in\" to the `Styler`, so you don't have to write them yourself."
+ "Some styling functions are common enough that we've \"built them in\" to the `Styler`, so you don't have to write them and apply them yourself. The current list of such functions is:\n",
+ "\n",
+ " - [.highlight_null][nullfunc]: for use with identifying missing data. \n",
+ " - [.highlight_min][minfunc] and [.highlight_max][maxfunc]: for use with identifying extremeties in data.\n",
+ " - [.highlight_between][betweenfunc] and [.highlight_quantile][quantilefunc]: for use with identifying classes within data.\n",
+ " - [.background_gradient][bgfunc]: a flexible method for highlighting cells based or their, or other, values on a numeric scale.\n",
+ " - [.bar][barfunc]: to display mini-charts within cell backgrounds.\n",
+ " \n",
+ "The individual documentation on each function often gives more examples of their arguments.\n",
+ "\n",
+ "[nullfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_null.rst\n",
+ "[minfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_min.rst\n",
+ "[maxfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_max.rst\n",
+ "[betweenfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_between.rst\n",
+ "[quantilefunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_quantile.rst\n",
+ "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst\n",
+ "[barfunc]: ../reference/api/pandas.io.formats.style.Styler.bar.rst"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Highlight Null"
]
},
{
@@ -1017,14 +1040,14 @@
"source": [
"df2.iloc[0,2] = np.nan\n",
"df2.iloc[4,3] = np.nan\n",
- "df2.loc[:4].style.highlight_null(null_color='red')"
+ "df2.loc[:4].style.highlight_null(null_color='yellow')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can create \"heatmaps\" with the `background_gradient` method. These require matplotlib, and we'll use [Seaborn](https://stanford.edu/~mwaskom/software/seaborn/) to get a nice colormap."
+ "### Highlight Min or Max"
]
},
{
@@ -1033,17 +1056,15 @@
"metadata": {},
"outputs": [],
"source": [
- "import seaborn as sns\n",
- "cm = sns.light_palette(\"green\", as_cmap=True)\n",
- "\n",
- "df2.style.background_gradient(cmap=cm)"
+ "df2.loc[:4].style.highlight_max(axis=1, props='color:white; font-weight:bold; background-color:darkblue;')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "`Styler.background_gradient` takes the keyword arguments `low` and `high`. Roughly speaking these extend the range of your data by `low` and `high` percent so that when we convert the colors, the colormap's entire range isn't used. This is useful so that you can actually read the text still."
+ "### Highlight Between\n",
+This method accepts ranges as floats, or as NumPy arrays or Series, provided the indexes match."
]
},
{
@@ -1052,8 +1073,16 @@
"metadata": {},
"outputs": [],
"source": [
- "# Uses the full color range\n",
- "df2.loc[:4].style.background_gradient(cmap='viridis')"
+ "left = pd.Series([1.0, 0.0, 1.0], index=[\"A\", \"B\", \"D\"])\n",
+ "df2.loc[:4].style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:purple;')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Highlight Quantile\n",
+ "Useful for detecting the highest or lowest percentile values"
]
},
{
@@ -1062,17 +1091,21 @@
"metadata": {},
"outputs": [],
"source": [
- "# Compress the color range\n",
- "df2.loc[:4].style\\\n",
- " .background_gradient(cmap='viridis', low=.5, high=0)\\\n",
- " .highlight_null('red')"
+ "df2.loc[:4].style.highlight_quantile(q_left=0.85, axis=None, color='yellow')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Background Gradient"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "There's also `.highlight_min` and `.highlight_max`, which is almost identical to the user defined version we created above, and also a `.highlight_null` method. "
+ "You can create \"heatmaps\" with the `background_gradient` method. These require matplotlib, and we'll use [Seaborn](https://stanford.edu/~mwaskom/software/seaborn/) to get a nice colormap."
]
},
{
@@ -1081,7 +1114,19 @@
"metadata": {},
"outputs": [],
"source": [
- "df2.loc[:4].style.highlight_max(axis=0)"
+ "import seaborn as sns\n",
+ "cm = sns.light_palette(\"green\", as_cmap=True)\n",
+ "\n",
+ "df2.style.background_gradient(cmap=cm)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[.background_gradient][bgfunc] has a number of keyword arguments to customise the gradients and colors. See its documentation.\n",
+ "\n",
+ "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst"
]
},
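+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A sketch of the `low`/`high` keywords mentioned above: they restrict the\n",
+ "# portion of the colormap that is used for the data range.\n",
+ "df2.loc[:4].style.background_gradient(cmap='viridis', low=0.5, high=0)"
+ ]
+ },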
{
diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst
index 16f9284802407..60e146b2212eb 100644
--- a/doc/source/whatsnew/v1.2.5.rst
+++ b/doc/source/whatsnew/v1.2.5.rst
@@ -15,7 +15,7 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`)
--
+- Regression in :func:`read_csv` when using ``memory_map=True`` with a non-UTF-8 encoding (:issue:`40986`)
-
.. ---------------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index a286d152f03c3..bf63a51204f5c 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -801,10 +801,13 @@ Plotting
- Bug in :func:`scatter_matrix` raising when 2d ``ax`` argument passed (:issue:`16253`)
- Prevent warnings when matplotlib's ``constrained_layout`` is enabled (:issue:`25261`)
+- Bug in :meth:`BoxPlot._validate_color_args` where caps and min/max markers of box plots were not visible when the ``dark_background`` theme was selected (:issue:`40769`)
- Bug in :func:`DataFrame.plot` was showing the wrong colors in the legend if the function was called repeatedly and some calls used ``yerr`` while others didn't (partial fix of :issue:`39522`)
- Bug in :func:`DataFrame.plot` was showing the wrong colors in the legend if the function was called repeatedly and some calls used ``secondary_y`` and others use ``legend=False`` (:issue:`40044`)
+
+
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Bug in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` with :class:`PeriodDtype` columns incorrectly casting results too aggressively (:issue:`38254`)
@@ -839,6 +842,7 @@ Groupby/resample/rolling
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`)
- Bug in :meth:`.GroupBy.rank` with nullable dtypes incorrectly raising ``TypeError`` (:issue:`41010`)
+
Reshaping
^^^^^^^^^
- Bug in :func:`merge` raising error when performing an inner join with partial index and ``right_index`` when no overlap between indices (:issue:`33814`)
diff --git a/environment.yml b/environment.yml
index 0d03ad8e0a46a..146bf6db08d8b 100644
--- a/environment.yml
+++ b/environment.yml
@@ -20,7 +20,7 @@ dependencies:
# code checks
- black=20.8b1
- cpplint
- - flake8=3.9.1
+ - flake8=3.9.0
- flake8-bugbear=21.3.2 # used by flake8, find likely bugs
- flake8-comprehensions=3.1.0 # used by flake8, linting of unnecessary comprehensions
- isort>=5.2.1 # check that imports are in the right order
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 8fb307150a48f..3fa92ce2229c3 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1277,6 +1277,7 @@ def group_min(groupby_t[:, ::1] out,
@cython.wraparound(False)
cdef group_cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
+ uint8_t[:, ::1] mask,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
@@ -1290,6 +1291,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
Array to store cummin/max in.
values : np.ndarray[groupby_t, ndim=2]
Values to take cummin/max of.
+ mask : np.ndarray[bool] or None
+ If not None, True entries mark missing values;
+ otherwise the mask is not used
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
@@ -1307,11 +1311,14 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
cdef:
Py_ssize_t i, j, N, K, size
groupby_t val, mval
- ndarray[groupby_t, ndim=2] accum
+ groupby_t[:, ::1] accum
intp_t lab
+ bint val_is_nan, use_mask
+
+ use_mask = mask is not None
N, K = (