Skip to content

Commit 93dd57c

Browse files
committed
Merge remote-tracking branch 'upstream/master' into str_cat_set
2 parents 6f32d43 + 5e06c84 commit 93dd57c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+2379
-1200
lines changed

asv_bench/benchmarks/indexing.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,17 @@
22

33
import numpy as np
44
import pandas.util.testing as tm
5-
from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index,
6-
Float64Index, IntervalIndex, CategoricalIndex,
5+
from pandas import (Series, DataFrame, Panel, MultiIndex,
6+
Int64Index, UInt64Index, Float64Index,
7+
IntervalIndex, CategoricalIndex,
78
IndexSlice, concat, date_range)
89

910

1011
class NumericSeriesIndexing(object):
1112

1213
goal_time = 0.2
1314
params = [
14-
(Int64Index, Float64Index),
15+
(Int64Index, UInt64Index, Float64Index),
1516
('unique_monotonic_inc', 'nonunique_monotonic_inc'),
1617
]
1718
param_names = ['index_dtype', 'index_structure']
+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import numpy as np
2+
3+
from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine,
4+
ObjectEngine)
5+
6+
7+
class NumericEngineIndexing(object):
8+
9+
goal_time = 0.2
10+
params = [[Int64Engine, UInt64Engine, Float64Engine],
11+
[np.int64, np.uint64, np.float64],
12+
['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
13+
]
14+
param_names = ['engine', 'dtype', 'index_type']
15+
16+
def setup(self, engine, dtype, index_type):
17+
N = 10**5
18+
values = list([1] * N + [2] * N + [3] * N)
19+
arr = {
20+
'monotonic_incr': np.array(values, dtype=dtype),
21+
'monotonic_decr': np.array(list(reversed(values)),
22+
dtype=dtype),
23+
'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype),
24+
}[index_type]
25+
26+
self.data = engine(lambda: arr, len(arr))
27+
# code below avoids populating the mapping etc. while timing.
28+
self.data.get_loc(2)
29+
30+
def time_get_loc(self, engine, dtype, index_type):
31+
self.data.get_loc(2)
32+
33+
34+
class ObjectEngineIndexing(object):
35+
36+
goal_time = 0.2
37+
params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')]
38+
param_names = ['index_type']
39+
40+
def setup(self, index_type):
41+
N = 10**5
42+
values = list('a' * N + 'b' * N + 'c' * N)
43+
arr = {
44+
'monotonic_incr': np.array(values, dtype=object),
45+
'monotonic_decr': np.array(list(reversed(values)), dtype=object),
46+
'non_monotonic': np.array(list('abc') * N, dtype=object),
47+
}[index_type]
48+
49+
self.data = ObjectEngine(lambda: arr, len(arr))
50+
# code below avoids populating the mapping etc. while timing.
51+
self.data.get_loc('b')
52+
53+
def time_get_loc(self, index_type):
54+
self.data.get_loc('b')

ci/azure-windows-36.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ channels:
55
dependencies:
66
- blosc
77
- bottleneck
8+
- boost-cpp<1.67
89
- fastparquet
910
- feather-format
1011
- matplotlib
1112
- numexpr
1213
- numpy=1.14*
1314
- openpyxl=2.5.5
15+
- parquet-cpp
1416
- pyarrow
1517
- pytables
1618
- python-dateutil

ci/code_checks.sh

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
5656
cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime
5757
RET=$(($RET + $?)) ; echo $MSG "DONE"
5858

59+
# Imports - Check formatting using isort see setup.cfg for settings
60+
MSG='Check import format using isort ' ; echo $MSG
61+
isort --recursive --check-only pandas
62+
RET=$(($RET + $?)) ; echo $MSG "DONE"
63+
5964
fi
6065

6166
### PATTERNS ###

ci/environment-dev.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies:
88
- flake8
99
- flake8-comprehensions
1010
- hypothesis>=3.58.0
11+
- isort
1112
- moto
1213
- pytest>=3.6
1314
- python-dateutil>=2.5.0

ci/requirements_dev.txt

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ NumPy
55
flake8
66
flake8-comprehensions
77
hypothesis>=3.58.0
8+
isort
89
moto
910
pytest>=3.6
1011
python-dateutil>=2.5.0

ci/travis-36.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ dependencies:
1414
- geopandas
1515
- html5lib
1616
- ipython
17+
- isort
1718
- jinja2
1819
- lxml
1920
- matplotlib

doc/source/whatsnew/v0.24.0.txt

+43-2
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ Other Enhancements
198198
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
199199
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
200200
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
201+
- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
202+
all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
201203
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
202204
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
203205
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
@@ -440,15 +442,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
440442
Raise ValueError in ``DataFrame.to_dict(orient='index')``
441443
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
442444

443-
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
445+
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
444446
``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
445447

446448
.. ipython:: python
447449
:okexcept:
448450

449451
df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
450452
df
451-
453+
452454
df.to_dict(orient='index')
453455

454456
.. _whatsnew_0240.api.datetimelike.normalize:
@@ -626,6 +628,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
626628
- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`).
627629
- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
628630
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
631+
- Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`)
629632
- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
630633
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
631634
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
@@ -788,6 +791,7 @@ Categorical
788791
^^^^^^^^^^^
789792

790793
- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
794+
- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
791795
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
792796
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
793797

@@ -923,6 +927,41 @@ MultiIndex
923927
I/O
924928
^^^
925929

930+
.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
931+
932+
Proper handling of `np.NaN` in a string data-typed column with the Python engine
933+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
934+
935+
There was bug in :func:`read_excel` and :func:`read_csv` with the Python
936+
engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
937+
``na_filter=True``. Now, these missing values are converted to the string
938+
missing indicator, ``np.nan``. (:issue:`20377`)
939+
940+
.. ipython:: python
941+
:suppress:
942+
943+
from pandas.compat import StringIO
944+
945+
Previous Behavior:
946+
947+
.. code-block:: ipython
948+
949+
In [5]: data = 'a,b,c\n1,,3\n4,5,6'
950+
In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
951+
In [7]: df.loc[0, 'b']
952+
Out[7]:
953+
'nan'
954+
955+
Current Behavior:
956+
957+
.. ipython:: python
958+
959+
data = 'a,b,c\n1,,3\n4,5,6'
960+
df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
961+
df.loc[0, 'b']
962+
963+
Notice how we now output ``np.nan`` itself instead of a stringified form of it.
964+
926965
- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
927966
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
928967
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
@@ -972,6 +1011,7 @@ Reshaping
9721011
- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)
9731012
- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`)
9741013
- Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`)
1014+
- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`)
9751015

9761016
.. _whatsnew_0240.bug_fixes.sparse:
9771017

@@ -985,6 +1025,7 @@ Sparse
9851025
- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array.
9861026
- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`)
9871027
- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`)
1028+
- Bug in :meth:`SparseArray.unique` not returning the unique values (:issue:`19595`)
9881029

9891030
Build Changes
9901031
^^^^^^^^^^^^^

pandas/_libs/algos_common_helper.pxi.in

+14-17
Original file line numberDiff line numberDiff line change
@@ -16,33 +16,30 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1616

1717
{{py:
1818

19-
# name, c_type, dest_type, dest_dtype
20-
dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'),
21-
('float32', 'float32_t', 'float32_t', 'np.float32'),
22-
('int8', 'int8_t', 'float32_t', 'np.float32'),
23-
('int16', 'int16_t', 'float32_t', 'np.float32'),
24-
('int32', 'int32_t', 'float64_t', 'np.float64'),
25-
('int64', 'int64_t', 'float64_t', 'np.float64')]
19+
# name, c_type, dest_type
20+
dtypes = [('float64', 'float64_t', 'float64_t'),
21+
('float32', 'float32_t', 'float32_t'),
22+
('int8', 'int8_t', 'float32_t'),
23+
('int16', 'int16_t', 'float32_t'),
24+
('int32', 'int32_t', 'float64_t'),
25+
('int64', 'int64_t', 'float64_t')]
2626

2727
def get_dispatch(dtypes):
2828

29-
for name, c_type, dest_type, dest_dtype, in dtypes:
30-
31-
dest_type2 = dest_type
32-
dest_type = dest_type.replace('_t', '')
33-
34-
yield name, c_type, dest_type, dest_type2, dest_dtype
29+
for name, c_type, dest_type, in dtypes:
30+
dest_name = dest_type[:-2] # i.e. strip "_t"
31+
yield name, c_type, dest_type, dest_name
3532

3633
}}
3734

38-
{{for name, c_type, dest_type, dest_type2, dest_dtype
35+
{{for name, c_type, dest_type, dest_name
3936
in get_dispatch(dtypes)}}
4037

4138

4239
@cython.boundscheck(False)
4340
@cython.wraparound(False)
4441
def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
45-
ndarray[{{dest_type2}}, ndim=2] out,
42+
ndarray[{{dest_type}}, ndim=2] out,
4643
Py_ssize_t periods, int axis):
4744
cdef:
4845
Py_ssize_t i, j, sx, sy
@@ -84,9 +81,9 @@ def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
8481
out[i, j] = arr[i, j] - arr[i, j - periods]
8582

8683

87-
def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
84+
def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
8885
ndarray[int64_t] indexer, Py_ssize_t loc,
89-
ndarray[{{dest_type2}}] out):
86+
ndarray[{{dest_type}}] out):
9087
cdef:
9188
Py_ssize_t i, j, k
9289

pandas/_libs/algos_rank_helper.pxi.in

+21-36
Original file line numberDiff line numberDiff line change
@@ -131,45 +131,20 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
131131
argsorted = _as.astype('i8')
132132

133133
{{if dtype == 'object'}}
134-
for i in range(n):
135-
sum_ranks += i + 1
136-
dups += 1
137-
isnan = sorted_mask[i]
138-
val = util.get_value_at(sorted_data, i)
139-
140-
if isnan and keep_na:
141-
ranks[argsorted[i]] = nan
142-
continue
143-
count += 1.0
144-
145-
if (i == n - 1 or
146-
are_diff(util.get_value_at(sorted_data, i + 1), val) or
147-
i == non_na_idx):
148-
if tiebreak == TIEBREAK_AVERAGE:
149-
for j in range(i - dups + 1, i + 1):
150-
ranks[argsorted[j]] = sum_ranks / dups
151-
elif tiebreak == TIEBREAK_MIN:
152-
for j in range(i - dups + 1, i + 1):
153-
ranks[argsorted[j]] = i - dups + 2
154-
elif tiebreak == TIEBREAK_MAX:
155-
for j in range(i - dups + 1, i + 1):
156-
ranks[argsorted[j]] = i + 1
157-
elif tiebreak == TIEBREAK_FIRST:
158-
raise ValueError('first not supported for non-numeric data')
159-
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
160-
for j in range(i - dups + 1, i + 1):
161-
ranks[argsorted[j]] = 2 * i - j - dups + 2
162-
elif tiebreak == TIEBREAK_DENSE:
163-
total_tie_count += 1
164-
for j in range(i - dups + 1, i + 1):
165-
ranks[argsorted[j]] = total_tie_count
166-
sum_ranks = dups = 0
134+
if True:
167135
{{else}}
168136
with nogil:
137+
{{endif}}
138+
# TODO: why does the 2d version not have a nogil block?
169139
for i in range(n):
170140
sum_ranks += i + 1
171141
dups += 1
142+
143+
{{if dtype == 'object'}}
144+
val = util.get_value_at(sorted_data, i)
145+
{{else}}
172146
val = sorted_data[i]
147+
{{endif}}
173148

174149
{{if dtype != 'uint64'}}
175150
isnan = sorted_mask[i]
@@ -180,8 +155,14 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
180155

181156
count += 1.0
182157

183-
if (i == n - 1 or sorted_data[i + 1] != val or
184-
i == non_na_idx):
158+
{{if dtype == 'object'}}
159+
if (i == n - 1 or
160+
are_diff(util.get_value_at(sorted_data, i + 1), val) or
161+
i == non_na_idx):
162+
{{else}}
163+
if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx):
164+
{{endif}}
165+
185166
if tiebreak == TIEBREAK_AVERAGE:
186167
for j in range(i - dups + 1, i + 1):
187168
ranks[argsorted[j]] = sum_ranks / dups
@@ -192,8 +173,13 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
192173
for j in range(i - dups + 1, i + 1):
193174
ranks[argsorted[j]] = i + 1
194175
elif tiebreak == TIEBREAK_FIRST:
176+
{{if dtype == 'object'}}
177+
raise ValueError('first not supported for '
178+
'non-numeric data')
179+
{{else}}
195180
for j in range(i - dups + 1, i + 1):
196181
ranks[argsorted[j]] = j + 1
182+
{{endif}}
197183
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
198184
for j in range(i - dups + 1, i + 1):
199185
ranks[argsorted[j]] = 2 * i - j - dups + 2
@@ -202,7 +188,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
202188
for j in range(i - dups + 1, i + 1):
203189
ranks[argsorted[j]] = total_tie_count
204190
sum_ranks = dups = 0
205-
{{endif}}
206191
if pct:
207192
if tiebreak == TIEBREAK_DENSE:
208193
return ranks / total_tie_count

0 commit comments

Comments
 (0)