pandas-dev · SultanOrazbayev · Mar 23, 2020 · Mar 23, 2020 · Mar 24, 2020 · Mar 24, 2020
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -223,27 +223,27 @@ def time_series_datetimeindex_repr(self):
 
 class All:
 
-    params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
-    param_names = ["N", "case"]
+    params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]]
+    param_names = ["N", "case", "dtype"]
 
-    def setup(self, N, case):
+    def setup(self, N, case, dtype):
         val = case != "fast"
-        self.s = Series([val] * N)
+        self.s = Series([val] * N, dtype=dtype)
 
-    def time_all(self, N, case):
+    def time_all(self, N, case, dtype):
         self.s.all()
 
 
 class Any:
 
-    params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
-    param_names = ["N", "case"]
+    params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]]
+    param_names = ["N", "case", "dtype"]
 
-    def setup(self, N, case):
+    def setup(self, N, case, dtype):
         val = case == "fast"
-        self.s = Series([val] * N)
+        self.s = Series([val] * N, dtype=dtype)
 
-    def time_any(self, N, case):
+    def time_any(self, N, case, dtype):
         self.s.any()
 
 
@@ -265,11 +265,14 @@ class NanOps:
             "prod",
         ],
         [10 ** 3, 10 ** 6],
-        ["int8", "int32", "int64", "float64"],
+        ["int8", "int32", "int64", "float64", "Int64", "boolean"],
     ]
     param_names = ["func", "N", "dtype"]
 
     def setup(self, func, N, dtype):
+        if func == "argmax" and dtype in {"Int64", "boolean"}:
+            # Skip argmax for nullable int since this doesn't work yet (GH-24382)
+            raise NotImplementedError
         self.s = Series([1] * N, dtype=dtype)
         self.func = getattr(self.s, func)
 

diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
@@ -7,11 +7,17 @@
 
 class FrameOps:
 
-    params = [ops, ["float", "int"], [0, 1]]
+    params = [ops, ["float", "int", "Int64"], [0, 1]]
     param_names = ["op", "dtype", "axis"]
 
     def setup(self, op, dtype, axis):
-        df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
+        if op == "mad" and dtype == "Int64" and axis == 1:
+            # GH-33036
+            raise NotImplementedError
+        values = np.random.randn(100000, 4)
+        if dtype == "Int64":
+            values = values.astype(int)
+        df = pd.DataFrame(values).astype(dtype)
         self.df_func = getattr(df, op)
 
     def time_op(self, op, dtype, axis):

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -283,14 +283,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
     pytest -q --doctest-modules pandas/core/tools/datetimes.py
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
-    MSG='Doctests top-level reshaping functions' ; echo $MSG
-    pytest -q --doctest-modules \
-        pandas/core/reshape/concat.py \
-        pandas/core/reshape/pivot.py \
-        pandas/core/reshape/reshape.py \
-        pandas/core/reshape/tile.py \
-        pandas/core/reshape/melt.py \
-        -k"-crosstab -pivot_table -cut"
+    MSG='Doctests reshaping functions' ; echo $MSG
+    pytest -q --doctest-modules pandas/core/reshape/
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Doctests interval classes' ; echo $MSG
@@ -325,6 +319,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
     MSG='Doctests generic.py' ; echo $MSG
     pytest -q --doctest-modules pandas/core/generic.py
     RET=$(($RET + $?)) ; echo $MSG "DONE"
+
+    MSG='Doctests tseries' ; echo $MSG
+    pytest -q --doctest-modules pandas/tseries/
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
 fi
 
 ### DOCSTRINGS ###

diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst
@@ -118,7 +118,7 @@ done by requesting the pandas ``dtypes`` attribute:
     titanic.dtypes
 
 For each of the columns, the used data type is enlisted. The data types
-in this ``DataFrame`` are integers (``int64``), floats (``float63``) and
+in this ``DataFrame`` are integers (``int64``), floats (``float64``) and
 strings (``object``).
 
 .. note::

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -28,7 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
     binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
     binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
     binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
-    binary;`ORC Format <//https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;
+    binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;
     binary;`Msgpack <https://msgpack.org/index.html>`__;:ref:`read_msgpack<io.msgpack>`;:ref:`to_msgpack<io.msgpack>`
     binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
     binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;
@@ -4817,7 +4817,7 @@ ORC
 
 .. versionadded:: 1.0.0
 
-Similar to the :ref:`parquet <io.parquet>` format, the `ORC Format <//https://orc.apache.org/>`__ is a binary columnar serialization
+Similar to the :ref:`parquet <io.parquet>` format, the `ORC Format <https://orc.apache.org/>`__ is a binary columnar serialization
 for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the
 ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.
 

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -641,21 +641,40 @@ You can check whether elements contain a pattern:
 .. ipython:: python
 
    pattern = r'[0-9][a-z]'
-   pd.Series(['1', '2', '3a', '3b', '03c'],
+   pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
              dtype="string").str.contains(pattern)
 
 Or whether elements match a pattern:
 
 .. ipython:: python
 
-   pd.Series(['1', '2', '3a', '3b', '03c'],
+   pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
              dtype="string").str.match(pattern)
 
-The distinction between ``match`` and ``contains`` is strictness: ``match``
-relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
+.. versionadded:: 1.1.0
 
-Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
-an extra ``na`` argument so missing values can be considered True or False:
+.. ipython:: python
+
+   pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
+             dtype="string").str.fullmatch(pattern)
+
+.. note::
+
+    The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness:
+    ``fullmatch`` tests whether the entire string matches the regular expression;
+    ``match`` tests whether there is a match of the regular expression that begins
+    at the first character of the string; and ``contains`` tests whether there is
+    a match of the regular expression at any position within the string.
+
+    The corresponding functions in the ``re`` package for these three match modes are
+    `re.fullmatch <https://docs.python.org/3/library/re.html#re.fullmatch>`_,
+    `re.match <https://docs.python.org/3/library/re.html#re.match>`_, and
+    `re.search <https://docs.python.org/3/library/re.html#re.search>`_,
+    respectively.
+
+Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and
+``endswith`` take an extra ``na`` argument so missing values can be considered
+True or False:
 
 .. ipython:: python
 

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -69,6 +69,7 @@ Other enhancements
 - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
 - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
 - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
+- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
 - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
 -
 
@@ -167,6 +168,32 @@ key and type of :class:`Index`.  These now consistently raise ``KeyError`` (:iss
     ...
     KeyError: Timestamp('1970-01-01 00:00:00')
 
+:meth:`DataFrame.merge` preserves right frame's row order
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`)
+
+.. ipython:: python
+
+    left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]})
+    right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]})
+    left_df
+    right_df
+
+*Previous behavior*:
+
+.. code-block:: python
+
+    >>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
+        animal  max_speed
+    0      pig         11
+    1  quetzal         80
+
+*New behavior*:
+
+.. ipython:: python
+
+    left_df.merge(right_df, on=['animal', 'max_speed'], how="right")
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_110.api_breaking.assignment_to_multiple_columns:
@@ -228,6 +255,8 @@ Performance improvements
   sparse values from ``scipy.sparse`` matrices using the
   :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
   :issue:`32825`,  :issue:`32826`, :issue:`32856`, :issue:`32858`).
+- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`).
+
 
 .. ---------------------------------------------------------------------------
 
@@ -254,13 +283,14 @@ Datetimelike
 - Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`)
 - :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`)
 - Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`)
+- Bug in :meth:`DatetimeIndex.searchsorted` not accepting a ``list`` or :class:`Series` as its argument (:issue:`32762`)
 
 Timedelta
 ^^^^^^^^^
 
 - Bug in constructing a :class:`Timedelta` with a high precision integer that would round the :class:`Timedelta` components (:issue:`31354`)
 - Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta`` incorrectly returning ``NaT`` (:issue:`31869`)
--
+- Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`)
 
 Timezones
 ^^^^^^^^^
@@ -286,7 +316,7 @@ Conversion
 Strings
 ^^^^^^^
 
--
+- Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`).
 -
 
 
@@ -308,6 +338,9 @@ Indexing
 - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`)
 - Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`)
 - Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. ``np.dtype`` (:issue:`32684`)
+- Fix to preserve the ability to index with the "nearest" method with xarray's CFTimeIndex, an :class:`Index` subclass (`pydata/xarray#3751 <https://github.com/pydata/xarray/issues/3751>`_, :issue:`32905`).
+- Bug in :class:`Index` constructor where an unhelpful error message was raised for ``numpy`` scalars (:issue:`33017`)
+- Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`)
 
 Missing
 ^^^^^^^
@@ -369,6 +402,8 @@ Groupby/resample/rolling
 
 - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
 - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`)
+- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`)
+
 
 Reshaping
 ^^^^^^^^^
@@ -381,11 +416,16 @@ Reshaping
 - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`)
 - :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`)
 - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`)
+- Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`)
 - :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`)
 - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`)
+- Bug on inplace operation of a Series that was adding a column to the DataFrame from where it was originally dropped from (using inplace=True) (:issue:`30484`)
 - Bug in :meth:`DataFrame.apply` where callback was called with :class:`Series` parameter even though ``raw=True`` requested. (:issue:`32423`)
 - Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`)
+- Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`)
 - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`)
+- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`)
+
 
 Sparse
 ^^^^^^

diff --git a/environment.yml b/environment.yml
@@ -101,6 +101,7 @@ dependencies:
   - s3fs  # pandas.read_csv... when using 's3://...' path
   - sqlalchemy  # pandas.read_sql, DataFrame.to_sql
   - xarray  # DataFrame.to_xarray
+  - cftime  # Needed for downstream xarray.CFTimeIndex test
   - pyreadstat  # pandas.read_spss
   - tabulate>=0.8.3  # DataFrame.to_markdown
   - pip:

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -38,8 +38,15 @@ cimport pandas._libs.util as util
 from pandas._libs.util cimport numeric, get_nat
 
 from pandas._libs.khash cimport (
-    khiter_t, kh_destroy_int64, kh_put_int64, kh_init_int64, kh_int64_t,
-    kh_resize_int64, kh_get_int64)
+    kh_destroy_int64,
+    kh_get_int64,
+    kh_init_int64,
+    kh_int64_t,
+    kh_put_int64,
+    kh_resize_int64,
+    khiter_t,
+)
+
 
 import pandas._libs.missing as missing
 
@@ -791,8 +798,13 @@ ctypedef fused rank_t:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def rank_1d(rank_t[:] in_arr, ties_method='average',
-            ascending=True, na_option='keep', pct=False):
+def rank_1d(
+    rank_t[:] in_arr,
+    ties_method="average",
+    bint ascending=True,
+    na_option="keep",
+    bint pct=False,
+):
     """
     Fast NaN-friendly version of ``scipy.stats.rankdata``.
     """
@@ -1009,8 +1021,14 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
         return ranks
 
 
-def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average',
-            ascending=True, na_option='keep', pct=False):
+def rank_2d(
+    rank_t[:, :] in_arr,
+    int axis=0,
+    ties_method="average",
+    bint ascending=True,
+    na_option="keep",
+    bint pct=False,
+):
     """
     Fast NaN-friendly version of ``scipy.stats.rankdata``.
     """
@@ -1190,9 +1208,12 @@ ctypedef fused out_t:
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def diff_2d(diff_t[:, :] arr,
-            out_t[:, :] out,
-            Py_ssize_t periods, int axis):
+def diff_2d(
+    diff_t[:, :] arr,
+    out_t[:, :] out,
+    Py_ssize_t periods,
+    int axis,
+):
     cdef:
         Py_ssize_t i, j, sx, sy, start, stop
         bint f_contig = arr.is_f_contig()