Sync Fork from Upstream Repo #92


Merged
merged 26 commits into from
Mar 15, 2020
f35bcac
TST: fix test creating invalid CategoricalBlock (#32519)
jbrockmendel Mar 14, 2020
5c7a901
PERF: Using Numpy C-API arange (#32681)
ShaharNaveh Mar 14, 2020
8111d64
PERF: MultiIndex._shallow_copy (#32669)
topper-123 Mar 14, 2020
029f707
Disallow lossy SparseArray conversion (#32501)
jbrockmendel Mar 14, 2020
98a6b3c
ENH: implement ExtensionIndex.insert (#32476)
jbrockmendel Mar 14, 2020
fd2e002
ENH: Categorical.fillna allow Categorical/ndarray (#32420)
jbrockmendel Mar 14, 2020
aa9a6dc
DOC: Fix EX01 in pandas.DataFrame.idxmin (#32697)
farhanreynaldo Mar 14, 2020
427af4d
CI: Update pipelines config to trigger on PRs (#32706)
datapythonista Mar 14, 2020
b32c4e8
skip 32 bit linux (#32708)
WillAyd Mar 14, 2020
d691633
CLN: Clean frame/test_constructors.py (#32610)
dsaxton Mar 14, 2020
ff7e2fa
CLN: Remove redundant index test from tests/base/test_ops.py (#32484)
SaturnFromTitan Mar 14, 2020
1a5b11d
BUG: Fix DataFrame.apply(..., raw=True) not calling with raw array (#…
kernc Mar 14, 2020
810a4e5
BUG: assignment to multiple columns when some column do not exist (#2…
howsiwei Mar 14, 2020
4603c63
BUG: retain tz in to_records (#32535)
jbrockmendel Mar 14, 2020
9e9785b
CLN: avoid Block.get_values where it is unnecessary (#32707)
jbrockmendel Mar 14, 2020
e734449
Deprecate Aliases as orient Argument in DataFrame.to_dict (#32516)
elmonsomiat Mar 14, 2020
0ed6d53
BUG: pivot_table losing tz (#32558)
jbrockmendel Mar 14, 2020
bab3b1f
REF: avoid runtime import of Index (#32710)
jbrockmendel Mar 15, 2020
e7e5b61
BUG: DatetimeArray._from_sequence accepting bool dtype (#32668)
jbrockmendel Mar 15, 2020
41322cf
Simplified get_blkno_indexers (#32645)
WillAyd Mar 15, 2020
7a24c35
DEPR: Categorical.to_dense (#32639)
jbrockmendel Mar 15, 2020
2e114ce
BUG: Fix file descriptor leak (#32598)
roberthdevries Mar 15, 2020
74c5306
TST: Add extra test for pandas.to_numeric() for issue #32394 (#32560)
roberthdevries Mar 15, 2020
74f6579
BUG: Don't multiply sets during construction (#32594)
dsaxton Mar 15, 2020
2b34275
CLN: Remove redundant tests for .duplicated and .drop_duplicates in t…
SaturnFromTitan Mar 15, 2020
6620dc6
ENH: Add support for dataclasses in the DataFrame constructor (#27999)
asosnovsky Mar 15, 2020
6 changes: 6 additions & 0 deletions azure-pipelines.yml
@@ -1,4 +1,10 @@
# Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml
trigger:
- master

pr:
- master

jobs:
# Mac and Linux use the same template
- template: ci/azure/posix.yml
10 changes: 5 additions & 5 deletions ci/azure/posix.yml
@@ -38,11 +38,11 @@ jobs:
LC_ALL: "it_IT.utf8"
EXTRA_APT: "language-pack-it xsel"

py36_32bit:
ENV_FILE: ci/deps/azure-36-32bit.yaml
CONDA_PY: "36"
PATTERN: "not slow and not network and not clipboard"
BITS32: "yes"
#py36_32bit:
# ENV_FILE: ci/deps/azure-36-32bit.yaml
# CONDA_PY: "36"
# PATTERN: "not slow and not network and not clipboard"
# BITS32: "yes"

py37_locale:
ENV_FILE: ci/deps/azure-37-locale.yaml
22 changes: 22 additions & 0 deletions doc/source/user_guide/dsintro.rst
@@ -397,6 +397,28 @@ The result will be a DataFrame with the same index as the input Series, and
with one column whose name is the original name of the Series (only if no other
column name provided).

.. _basics.dataframe.from_list_dataclasses:

From a list of dataclasses
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. versionadded:: 1.1.0

Data classes, as introduced in `PEP557 <https://www.python.org/dev/peps/pep-0557>`__,
can be passed into the DataFrame constructor.
Passing a list of dataclasses is equivalent to passing a list of dictionaries.

Please be aware that all values in the list should be dataclasses; mixing
types in the list will result in a ``TypeError``.

.. ipython:: python

from dataclasses import make_dataclass

Point = make_dataclass("Point", [("x", int), ("y", int)])

pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])

**Missing data**

Much more will be said on this topic in the :ref:`Missing data <missing_data>`
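
The dsintro section above notes that a homogeneous list of dataclasses builds one row per instance, and that mixing types raises a ``TypeError``. A minimal sketch of both behaviors, using the same ``Point`` class as the docs:

```python
from dataclasses import make_dataclass

import pandas as pd

# Same Point class as in the doc example above.
Point = make_dataclass("Point", [("x", int), ("y", int)])

# Homogeneous list of dataclass instances: one row per instance,
# one column per dataclass field.
df = pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
print(df.columns.tolist())  # ['x', 'y']
print(df.shape)             # (3, 2)

# Mixing dataclasses with other element types raises a TypeError.
try:
    pd.DataFrame([Point(0, 0), {"x": 1, "y": 2}])
except TypeError:
    print("mixed list raised TypeError")
```

This mirrors the dict-of-records path: each instance is converted to a dict of its fields before normal record construction.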
43 changes: 40 additions & 3 deletions doc/source/whatsnew/v1.1.0.rst
@@ -168,14 +168,46 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss

.. ---------------------------------------------------------------------------

.. _whatsnew_110.api_breaking.assignment_to_multiple_columns:

Assignment to multiple columns of a DataFrame when some columns do not exist
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns are constructed with the right values (:issue:`13658`).

.. ipython:: python

df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]})
df

*Previous behavior*:

.. code-block:: ipython

In [3]: df[['a', 'c']] = 1
In [4]: df
Out[4]:
a b
0 1 1
1 1 1
2 1 1

*New behavior*:

.. ipython:: python

df[['a', 'c']] = 1
df

.. _whatsnew_110.deprecations:

Deprecations
~~~~~~~~~~~~
- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`)
- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`)
- Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
-
- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` and will raise in a future version (:issue:`32515`)
- :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`)

.. ---------------------------------------------------------------------------

@@ -190,7 +222,7 @@ Performance improvements
- Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`)
- The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
existing indexes (:issue:`28584`, :issue:`32640`)
existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)

.. ---------------------------------------------------------------------------

@@ -216,6 +248,7 @@ Datetimelike
- Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`)
- Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`)
- :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`)
- Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`)

Timedelta
^^^^^^^^^
@@ -241,7 +274,7 @@ Conversion
^^^^^^^^^^
- Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`)
- Bug in :class:`Timedelta` construction with large nanoseconds keyword value (:issue:`32402`)
-
- Bug in :class:`DataFrame` construction where sets would be duplicated rather than raising (:issue:`32582`)

Strings
^^^^^^^
@@ -306,6 +339,7 @@ I/O
- Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)


Plotting
@@ -335,6 +369,8 @@ Reshaping
- Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`)
- :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`)
- :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`)
- Bug in :meth:`DataFrame.apply` where the callback was called with a :class:`Series` parameter even though ``raw=True`` was requested (:issue:`32423`)
- Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`)


Sparse
@@ -356,6 +392,7 @@ Other
instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
- Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`)
- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`)

.. ---------------------------------------------------------------------------

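
Among the deprecations listed in the whatsnew above, ``Categorical.to_dense`` points to ``np.asarray(cat)`` as the replacement. A quick sketch of the recommended call:

```python
import numpy as np
import pandas as pd

cat = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])

# Replacement for the deprecated cat.to_dense(): materialize the
# categorical as a plain ndarray of its values.
dense = np.asarray(cat)
print(dense.tolist())  # ['a', 'b', 'a']
```

``np.asarray`` goes through ``Categorical.__array__``, so unused categories (here ``'c'``) simply do not appear in the result.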
30 changes: 11 additions & 19 deletions pandas/_libs/internals.pyx
@@ -1,4 +1,5 @@
import cython
from collections import defaultdict
from cython import Py_ssize_t

from cpython.slice cimport PySlice_GetIndicesEx
@@ -7,7 +8,9 @@ cdef extern from "Python.h":
Py_ssize_t PY_SSIZE_T_MAX

import numpy as np
from numpy cimport int64_t
cimport numpy as cnp
from numpy cimport NPY_INT64, int64_t
cnp.import_array()

from pandas._libs.algos import ensure_int64

@@ -105,7 +108,9 @@ cdef class BlockPlacement:
Py_ssize_t start, stop, end, _
if not self._has_array:
start, stop, step, _ = slice_get_indices_ex(self._as_slice)
self._as_array = np.arange(start, stop, step, dtype=np.int64)
# NOTE: this is the C-optimized equivalent of
# np.arange(start, stop, step, dtype=np.int64)
self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64)
self._has_array = True
return self._as_array

@@ -369,8 +374,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True):
Py_ssize_t i, start, stop, n, diff

object blkno
list group_order
dict group_dict
object group_dict = defaultdict(list)
int64_t[:] res_view

n = blknos.shape[0]
@@ -391,28 +395,16 @@

yield cur_blkno, slice(start, n)
else:
group_order = []
group_dict = {}

for i in range(1, n):
if blknos[i] != cur_blkno:
if cur_blkno not in group_dict:
group_order.append(cur_blkno)
group_dict[cur_blkno] = [(start, i)]
else:
group_dict[cur_blkno].append((start, i))
group_dict[cur_blkno].append((start, i))

start = i
cur_blkno = blknos[i]

if cur_blkno not in group_dict:
group_order.append(cur_blkno)
group_dict[cur_blkno] = [(start, n)]
else:
group_dict[cur_blkno].append((start, n))
group_dict[cur_blkno].append((start, n))

for blkno in group_order:
slices = group_dict[blkno]
for blkno, slices in group_dict.items():
if len(slices) == 1:
yield blkno, slice(slices[0][0], slices[0][1])
else:
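
The ``get_blkno_indexers`` refactor above drops the separate ``group_order`` bookkeeping list: a single ``defaultdict(list)`` already yields groups in first-seen order, since dicts preserve insertion order in Python 3.7+. A pure-Python sketch of the grouping loop, with hypothetical sample data:

```python
from collections import defaultdict

# Hypothetical block numbers, one per column position.
blknos = [0, 0, 1, 1, 0, 2]

# Map each block number to its list of contiguous (start, stop) runs.
group_dict = defaultdict(list)

start = 0
cur = blknos[0]
for i in range(1, len(blknos)):
    if blknos[i] != cur:
        group_dict[cur].append((start, i))
        start = i
        cur = blknos[i]
group_dict[cur].append((start, len(blknos)))

# Iteration order is first-seen order, so no group_order list is needed.
print(dict(group_dict))  # {0: [(0, 2), (4, 5)], 1: [(2, 4)], 2: [(5, 6)]}
```

This is why the removed ``if cur_blkno not in group_dict`` branches were redundant: ``defaultdict`` creates the empty list on first access, and the dict itself remembers insertion order.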
9 changes: 9 additions & 0 deletions pandas/conftest.py
@@ -425,6 +425,15 @@ def nselect_method(request):
return request.param


@pytest.fixture(params=["first", "last", False])
def keep(request):
"""
Valid values for the 'keep' parameter used in
.duplicated or .drop_duplicates
"""
return request.param


@pytest.fixture(params=["left", "right", "both", "neither"])
def closed(request):
"""
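
The new ``keep`` fixture above parametrizes tests over the three valid values of the ``keep`` argument to ``.duplicated``/``.drop_duplicates``. Their effect on a small Series:

```python
import pandas as pd

s = pd.Series([1, 2, 1, 3])

# keep="first": later occurrences of a value are marked as duplicates.
print(s.duplicated(keep="first").tolist())  # [False, False, True, False]
# keep="last": earlier occurrences are marked instead.
print(s.duplicated(keep="last").tolist())   # [True, False, False, False]
# keep=False: every occurrence of a repeated value is marked.
print(s.duplicated(keep=False).tolist())    # [True, False, True, False]
```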
22 changes: 12 additions & 10 deletions pandas/core/algorithms.py
@@ -11,6 +11,7 @@

from pandas._libs import Timestamp, algos, hashtable as htable, lib
from pandas._libs.tslib import iNaT
from pandas._typing import AnyArrayLike
from pandas.util._decorators import doc

from pandas.core.dtypes.cast import (
@@ -45,10 +46,14 @@
is_unsigned_integer_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core.dtypes.generic import (
ABCExtensionArray,
ABCIndex,
ABCIndexClass,
ABCSeries,
)
from pandas.core.dtypes.missing import isna, na_value_for_dtype

import pandas.core.common as com
from pandas.core.construction import array, extract_array
from pandas.core.indexers import validate_indices

@@ -384,7 +389,7 @@ def unique(values):
unique1d = unique


def isin(comps, values) -> np.ndarray:
def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
"""
Compute the isin boolean array.

@@ -409,15 +414,14 @@ def isin(comps, values) -> np.ndarray:
f"to isin(), you passed a [{type(values).__name__}]"
)

if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
values = construct_1d_object_array_from_listlike(list(values))

comps = extract_array(comps, extract_numpy=True)
if is_categorical_dtype(comps):
# TODO(extension)
# handle categoricals
return comps._values.isin(values)

comps = com.values_from_object(comps)
return comps.isin(values) # type: ignore

comps, dtype = _ensure_data(comps)
values, _ = _ensure_data(values, dtype=dtype)
Expand Down Expand Up @@ -2021,9 +2025,7 @@ def sort_mixed(values):
)
codes = ensure_platform_int(np.asarray(codes))

from pandas import Index

if not assume_unique and not Index(values).is_unique:
if not assume_unique and not len(unique(values)) == len(values):
raise ValueError("values should be unique if codes is not None")

if sorter is None:
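
With the ``ABCExtensionArray`` addition to the ``isinstance`` check above, ``isin`` accepts an ExtensionArray for ``values`` directly instead of first coercing it to an object array. For example, through the public ``Series.isin``:

```python
import pandas as pd

comps = pd.Series([1, 2, 3, 4])
values = pd.array([2, 4])  # a pandas ExtensionArray (Int64 by default)

mask = comps.isin(values)
print(mask.tolist())  # [False, True, False, True]
```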
2 changes: 1 addition & 1 deletion pandas/core/apply.py
@@ -179,7 +179,7 @@ def get_result(self):
return self.apply_empty_result()

# raw
elif self.raw and not self.obj._is_mixed_type:
elif self.raw:
return self.apply_raw()

return self.apply_standard()
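
The one-line change above (dropping the ``_is_mixed_type`` guard) means ``raw=True`` is honored even for mixed-dtype frames: the callback receives a NumPy array rather than a Series. A small check:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})  # mixed dtypes

seen = []
df.apply(lambda row: seen.append(type(row).__name__), axis=1, raw=True)

# Before the fix, mixed-dtype frames fell back to passing Series objects.
print(set(seen))  # {'ndarray'}
```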
15 changes: 13 additions & 2 deletions pandas/core/arrays/categorical.py
@@ -1675,6 +1675,12 @@ def to_dense(self):
-------
dense : array
"""
warn(
"Categorical.to_dense is deprecated and will be removed in "
"a future version. Use np.asarray(cat) instead.",
FutureWarning,
stacklevel=2,
)
return np.asarray(self)

def fillna(self, value=None, method=None, limit=None):
@@ -1733,12 +1739,17 @@ def fillna(self, value=None, method=None, limit=None):

# If value is a dict or a Series (a dict value has already
# been converted to a Series)
if isinstance(value, ABCSeries):
if not value[~value.isin(self.categories)].isna().all():
if isinstance(value, (np.ndarray, Categorical, ABCSeries)):
# We get ndarray or Categorical if called via Series.fillna,
# where it will unwrap another aligned Series before getting here

mask = ~algorithms.isin(value, self.categories)
if not isna(value[mask]).all():
raise ValueError("fill value must be in categories")

values_codes = _get_codes_for_values(value, self.categories)
indexer = np.where(codes == -1)
codes = codes.copy()
codes[indexer] = values_codes[indexer]

# If value is not a dict or Series it should be a scalar
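
With the widened ``isinstance`` check above, ``Categorical.fillna`` accepts an aligned ndarray or Categorical of fill values — the shapes ``Series.fillna`` produces after unwrapping another Series. A sketch; the fill values must already be valid categories:

```python
import numpy as np
import pandas as pd

cat = pd.Categorical(["a", None, "b", None], categories=["a", "b", "c"])

# Positional fill values; only the NA slots are consulted.
filled = cat.fillna(np.array(["c", "c", "c", "c"], dtype=object))
print(list(filled))  # ['a', 'c', 'b', 'c']

# A fill value outside the categories is rejected (the exact exception
# type has varied across pandas versions).
```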
15 changes: 10 additions & 5 deletions pandas/core/arrays/datetimes.py
@@ -23,6 +23,7 @@
from pandas.core.dtypes.common import (
_INT64_DTYPE,
_NS_DTYPE,
is_bool_dtype,
is_categorical_dtype,
is_datetime64_any_dtype,
is_datetime64_dtype,
@@ -1903,32 +1904,36 @@ def maybe_convert_dtype(data, copy):
------
TypeError : PeriodDType data is passed
"""
if is_float_dtype(data):
if not hasattr(data, "dtype"):
# e.g. collections.deque
return data, copy

if is_float_dtype(data.dtype):
# Note: we must cast to datetime64[ns] here in order to treat these
# as wall-times instead of UTC timestamps.
data = data.astype(_NS_DTYPE)
copy = False
# TODO: deprecate this behavior to instead treat symmetrically
# with integer dtypes. See discussion in GH#23675

elif is_timedelta64_dtype(data):
elif is_timedelta64_dtype(data.dtype) or is_bool_dtype(data.dtype):
# GH#29794 enforcing deprecation introduced in GH#23539
raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]")
elif is_period_dtype(data):
elif is_period_dtype(data.dtype):
# Note: without explicitly raising here, PeriodIndex
# test_setops.test_join_does_not_recur fails
raise TypeError(
"Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead"
)

elif is_categorical_dtype(data):
elif is_categorical_dtype(data.dtype):
# GH#18664 preserve tz in going DTI->Categorical->DTI
# TODO: cases where we need to do another pass through this func,
# e.g. the categories are timedelta64s
data = data.categories.take(data.codes, fill_value=NaT)._values
copy = False

elif is_extension_array_dtype(data) and not is_datetime64tz_dtype(data):
elif is_extension_array_dtype(data.dtype) and not is_datetime64tz_dtype(data.dtype):
# Includes categorical
# TODO: We have no tests for these
data = np.array(data, dtype=np.object_)
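
Per the ``is_bool_dtype`` branch added above, bool-dtype input is now rejected outright when constructing datetime64 values, instead of being silently cast. A sketch of the user-visible effect:

```python
import numpy as np
import pandas as pd

try:
    pd.DatetimeIndex(np.array([True, False]))
except TypeError as exc:
    # e.g. "dtype bool cannot be converted to datetime64[ns]"
    print("TypeError:", exc)
```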