BUG: Fixed incorrect type in integer conversion in to_stata #6335

Closed
wants to merge 71 commits

Commits
2448f48
BUG: Changes types used in packing structs
bashtage Feb 12, 2014
eeded1f
Added test for integer conversion bug
bashtage Feb 12, 2014
fba4572
Removed unintended whitespace
bashtage Feb 12, 2014
50f579f
Fixed another typo
bashtage Feb 12, 2014
77939e0
BUG: Changes types used in packing structs
bashtage Feb 12, 2014
fa7faff
Added test for integer conversion bug
bashtage Feb 12, 2014
26b5cdc
Removed unintended whitespace
bashtage Feb 12, 2014
a572356
Fixed another typo
bashtage Feb 12, 2014
467a84f
FIX: Corrected incorrect data type conversion between pandas and Stata
bashtage Feb 26, 2014
3915e07
Merge branch 'stata-export-datatype' of https://github.com/bashtage/p…
bashtage Feb 26, 2014
174aca3
Removed unintended branch merge
bashtage Feb 26, 2014
89fb3c0
Fixed formatting in comparison after casting
bashtage Feb 26, 2014
6308776
Added docstring for new function and warning class
bashtage Feb 26, 2014
4e65c25
BUG: Changes types used in packing structs
bashtage Feb 12, 2014
6b99643
Added test for integer conversion bug
bashtage Feb 12, 2014
bfed97b
Removed unintended whitespace
bashtage Feb 12, 2014
144516a
Fixed another typo
bashtage Feb 12, 2014
f4eb138
FIX: Corrected incorrect data type conversion between pandas and Stata
bashtage Feb 26, 2014
faae4a0
BUG: Changes types used in packing structs
bashtage Feb 12, 2014
de11ef9
Added test for integer conversion bug
bashtage Feb 12, 2014
4e0df96
Removed unintended whitespace
bashtage Feb 12, 2014
c4cff55
Fixed another typo
bashtage Feb 12, 2014
afee2dc
Removed unintended branch merge
bashtage Feb 26, 2014
4a96faa
Fixed formatting in comparison after casting
bashtage Feb 26, 2014
238bb93
Added docstring for new function and warning class
bashtage Feb 26, 2014
5c0f438
Merge branch 'stata-export-datatype' of https://github.com/bashtage/p…
bashtage Feb 26, 2014
13f56ee
BUG: Changes types used in packing structs
bashtage Feb 12, 2014
d30b445
Added test for integer conversion bug
bashtage Feb 12, 2014
58bc8ce
Removed unintended whitespace
bashtage Feb 12, 2014
c925923
Fixed another typo
bashtage Feb 12, 2014
7fb4d1b
Added test for integer conversion bug
bashtage Feb 12, 2014
9163fe8
Fixed another typo
bashtage Feb 12, 2014
f329ed0
FIX: Corrected incorrect data type conversion between pandas and Stata
bashtage Feb 26, 2014
9e05c86
Fixed formatting in comparison after casting
bashtage Feb 26, 2014
4d21b71
Added docstring for new function and warning class
bashtage Feb 26, 2014
07b1885
BUG: Changes types used in packing structs
bashtage Feb 12, 2014
a0a0cad
Added test for integer conversion bug
bashtage Feb 12, 2014
f7aaa9e
FIX: Corrected incorrect data type conversion between pandas and Stata
bashtage Feb 26, 2014
1661158
BUG: Changes types used in packing structs
bashtage Feb 12, 2014
00232bc
Added test for integer conversion bug
bashtage Feb 12, 2014
f8de199
Fixed another typo
bashtage Feb 12, 2014
02e4472
Removed unintended branch merge
bashtage Feb 26, 2014
b3a3366
Merge branch 'stata-export-datatype' of https://github.com/bashtage/p…
bashtage Feb 26, 2014
9788ad1
PERF: optimize index.__getitem__ for slice & boolean mask indexers
immerrr Feb 22, 2014
cda4216
BUG: Fixes and tests for extreme values in all data types
bashtage Feb 28, 2014
f149927
Merge pull request #6440 from immerrr/index-getitem-performance
jreback Feb 28, 2014
3bde9c9
BUG: Fixes and tests for extreme values in all data types
bashtage Feb 28, 2014
016bbf0
Merge branch 'master' of git://github.com/pydata/pandas into stata-ex…
bashtage Feb 28, 2014
6efa4c1
Merge pull request #6506 from jreback/dup_loc
jreback Feb 28, 2014
20d6191
BUG: Fixes and tests for extreme values in all data types
bashtage Feb 28, 2014
ca85da8
Disabled the big endian skips
bashtage Feb 28, 2014
a66ae27
Fixed legacy date issue with format 114 files
bashtage Feb 28, 2014
840efe6
Added format 114 (Stata 9/10/11) data file
bashtage Feb 28, 2014
661ab24
Add test for Stata data with file format 114
bashtage Feb 28, 2014
61b141b
ENH: add method='dense' to rank
dsm054 Mar 1, 2014
1dc157c
Added additional data files for testing alternative Stata file formats
bashtage Mar 1, 2014
38ddd91
Added expected result to test
bashtage Mar 2, 2014
530311c
BUG: Changes types used in packing structs
bashtage Feb 12, 2014
0f9ff84
Corrected incorrect data type conversion between pandas and Stata
bashtage Feb 26, 2014
0f36d6b
Added docstring for new function and warning class
bashtage Feb 26, 2014
7cb87f4
BUG: Fixes and tests for extreme values in all data types
bashtage Feb 28, 2014
8ba7a35
BUG: Fixes and tests for extreme values in all data types
bashtage Feb 28, 2014
d171d96
BUG: Fixes and tests for extreme values in all data types
bashtage Feb 28, 2014
adace0f
Disabled the big endian skips
bashtage Feb 28, 2014
040b736
Fixed legacy date issue with format 114 files
bashtage Feb 28, 2014
4f719ad
Added format 114 (Stata 9/10/11) data file
bashtage Feb 28, 2014
266983d
Add test for Stata data with file format 114
bashtage Feb 28, 2014
4ce821b
Added additional data files for testing alternative Stata file formats
bashtage Mar 1, 2014
ae34642
Added expected result to test
bashtage Mar 2, 2014
d83e902
Fixed final PEP8 issues in test_stata
bashtage Mar 2, 2014
27b5278
Added changes and enhancements to documentation
bashtage Mar 2, 2014
7 changes: 7 additions & 0 deletions doc/source/release.rst
@@ -105,6 +105,8 @@ API Changes
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`)
- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
  longer change the type of the resulting index (:issue:`6440`).

Experimental Features
~~~~~~~~~~~~~~~~~~~~~
@@ -125,6 +127,7 @@ Improvements to existing features
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
- Testing statements updated to use specialized asserts (:issue:`6175`)
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`)
- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
- perf improvements in DataFrame construction with certain offsets, by removing faulty caching
@@ -191,6 +194,10 @@ Bug Fixes
- Bug in ``read_html`` tests where redirected invalid URLs would make one test
fail (:issue:`6445`).
- Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`)
- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`)
- Bug in ``DataFrame.to_stata`` that led to data loss in certain cases (:issue:`6335`)
- Bug in ``DataFrame.to_stata`` which exported using the wrong data types and missing values (:issue:`6335`)


pandas 0.13.1
-------------
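The `to_stata`/`read_stata` fixes noted above can be exercised with a small round-trip (a sketch against a current pandas; the buffer-based call assumes a pandas version whose `to_stata` accepts file-like objects):

```python
import io

import pandas as pd

# Round-trip a small integer column through the Stata writer and reader.
# Stata has no 64-bit integer type, so the writer stores values that fit
# in a 32-bit range using a narrower Stata integer type; the data itself
# survives unchanged.
df = pd.DataFrame({"x": [1, 2, 3]})

buf = io.BytesIO()
df.to_stata(buf, write_index=False)
buf.seek(0)

back = pd.read_stata(buf)
assert back["x"].tolist() == [1, 2, 3]
```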
18 changes: 18 additions & 0 deletions doc/source/v0.14.0.txt
@@ -78,6 +78,21 @@ These are out-of-bounds selections
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`
- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
  longer change the type of the resulting index (:issue:`6440`)

.. ipython:: python

i = pd.Index([1, 2, 3, 'a', 'b', 'c'])
i[[0,1,2]]

Previously, the above operation would return ``Int64Index``. If you'd like
to do this manually, use :meth:`Index.astype`

.. ipython:: python

i[[0,1,2]].astype(np.int_)


MultiIndexing Using Slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -233,6 +248,9 @@ Enhancements
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
- Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex``
to convert to the Julian Date used primarily in astronomy. (:issue:`4041`)
- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types
  and will upcast when needed. When it isn't possible to losslessly upcast, a warning
is raised (:issue:`6327`)

Performance
~~~~~~~~~~~
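The compatibility check described in the enhancement note can be sketched as a plain dtype-mapping helper. This is a hypothetical illustration, not the actual pandas implementation, and the exact thresholds pandas uses may differ:

```python
import warnings

import numpy as np

# Stata has no unsigned and no 64-bit integer types, so unsigned columns
# are upcast to the next wider signed type, and int64 data that does not
# fit in 32 bits falls back to float64 with a warning, since that cast
# can lose precision.
_UNSIGNED_UPCASTS = {
    np.dtype(np.uint8): np.int16,
    np.dtype(np.uint16): np.int32,
    np.dtype(np.uint32): np.int64,
}


def stata_compatible(values):
    """Return `values` cast to a dtype a Stata file could hold."""
    if values.dtype in _UNSIGNED_UPCASTS:
        values = values.astype(_UNSIGNED_UPCASTS[values.dtype])
    if values.dtype == np.int64:
        info = np.iinfo(np.int32)
        if values.min() >= info.min and values.max() <= info.max:
            return values.astype(np.int32)
        warnings.warn("int64 values do not fit in Stata's 32-bit integer "
                      "type; casting to float64 may lose precision")
        return values.astype(np.float64)
    return values
```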
42 changes: 36 additions & 6 deletions pandas/algos.pyx
@@ -68,12 +68,14 @@ cdef:
int TIEBREAK_MAX = 2
int TIEBREAK_FIRST = 3
int TIEBREAK_FIRST_DESCENDING = 4
int TIEBREAK_DENSE = 5

tiebreakers = {
'average' : TIEBREAK_AVERAGE,
'min' : TIEBREAK_MIN,
'max' : TIEBREAK_MAX,
'first' : TIEBREAK_FIRST
'first' : TIEBREAK_FIRST,
'dense' : TIEBREAK_DENSE,
}


@@ -137,7 +139,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
"""

cdef:
Py_ssize_t i, j, n, dups = 0
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
ndarray[float64_t] sorted_data, ranks, values
ndarray[int64_t] argsorted
float64_t val, nan_value
@@ -200,6 +202,10 @@
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = 2 * i - j - dups + 2
elif tiebreak == TIEBREAK_DENSE:
total_tie_count += 1
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = total_tie_count
sum_ranks = dups = 0
if pct:
return ranks / count
@@ -214,7 +220,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
"""

cdef:
Py_ssize_t i, j, n, dups = 0
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
ndarray[int64_t] sorted_data, values
ndarray[float64_t] ranks
ndarray[int64_t] argsorted
@@ -265,6 +271,10 @@
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = 2 * i - j - dups + 2
elif tiebreak == TIEBREAK_DENSE:
total_tie_count += 1
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = total_tie_count
sum_ranks = dups = 0
if pct:
return ranks / count
@@ -279,7 +289,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
"""

cdef:
Py_ssize_t i, j, z, k, n, dups = 0
Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
ndarray[float64_t, ndim=2] ranks, values
ndarray[int64_t, ndim=2] argsorted
float64_t val, nan_value
@@ -324,6 +334,7 @@

for i in range(n):
dups = sum_ranks = 0
total_tie_count = 0
for j in range(k):
sum_ranks += j + 1
dups += 1
@@ -347,6 +358,10 @@
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
elif tiebreak == TIEBREAK_DENSE:
total_tie_count += 1
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0

if axis == 0:
@@ -362,7 +377,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
"""

cdef:
Py_ssize_t i, j, z, k, n, dups = 0
Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
ndarray[float64_t, ndim=2] ranks
ndarray[int64_t, ndim=2] argsorted
ndarray[int64_t, ndim=2, cast=True] values
@@ -395,6 +410,7 @@

for i in range(n):
dups = sum_ranks = 0
total_tie_count = 0
for j in range(k):
sum_ranks += j + 1
dups += 1
@@ -415,6 +431,10 @@
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
elif tiebreak == TIEBREAK_DENSE:
total_tie_count += 1
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0

if axis == 0:
@@ -430,7 +450,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
"""

cdef:
Py_ssize_t i, j, n, dups = 0
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0
ndarray[float64_t] ranks
ndarray sorted_data, values
ndarray[int64_t] argsorted
@@ -502,6 +522,10 @@
ranks[argsorted[j]] = i + 1
elif tiebreak == TIEBREAK_FIRST:
raise ValueError('first not supported for non-numeric data')
elif tiebreak == TIEBREAK_DENSE:
total_tie_count += 1
for j in range(i - dups + 1, i + 1):
ranks[argsorted[j]] = total_tie_count
sum_ranks = dups = 0
if pct:
return ranks / count
@@ -545,6 +569,7 @@

cdef:
Py_ssize_t i, j, z, k, n, infs, dups = 0
Py_ssize_t total_tie_count = 0
ndarray[float64_t, ndim=2] ranks
ndarray[object, ndim=2] values
ndarray[int64_t, ndim=2] argsorted
@@ -600,6 +625,7 @@

for i in range(n):
dups = sum_ranks = infs = 0
total_tie_count = 0
for j in range(k):
val = values[i, j]
if val is nan_value and keep_na:
@@ -621,6 +647,10 @@
elif tiebreak == TIEBREAK_FIRST:
raise ValueError('first not supported for '
'non-numeric data')
elif tiebreak == TIEBREAK_DENSE:
total_tie_count += 1
for z in range(j - dups + 1, j + 1):
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0

if axis == 0:
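The `TIEBREAK_DENSE` branches added throughout these functions all follow the same pattern; a pure-Python sketch of the 1-d case makes it easier to read:

```python
# A pure-Python sketch of the TIEBREAK_DENSE logic: walk the values in
# sorted order, count duplicates, and bump a single tie counter once per
# group of equal values, so the rank always increases by exactly 1
# between distinct values.

def dense_rank(values):
    n = len(values)
    argsorted = sorted(range(n), key=lambda idx: values[idx])
    ranks = [0] * n
    dups = 0                 # size of the current group of equal values
    total_tie_count = 0      # the dense rank assigned to the next group
    for i in range(n):
        dups += 1
        at_end = i == n - 1
        if at_end or values[argsorted[i + 1]] != values[argsorted[i]]:
            total_tie_count += 1
            for j in range(i - dups + 1, i + 1):
                ranks[argsorted[j]] = total_tie_count
            dups = 0
    return ranks
```

With `dense_rank([3, 1, 4, 1, 5])` the tied 1s share rank 1 and the remaining values get ranks 2, 3 and 4 with no gap.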
3 changes: 2 additions & 1 deletion pandas/core/frame.py
@@ -4182,11 +4182,12 @@ def rank(self, axis=0, numeric_only=None, method='average',
Ranks over columns (0) or rows (1)
numeric_only : boolean, default None
Include only float, int, boolean data
method : {'average', 'min', 'max', 'first'}
method : {'average', 'min', 'max', 'first', 'dense'}
* average: average rank of group
* min: lowest rank in group
* max: highest rank in group
* first: ranks assigned in order they appear in the array
* dense: like 'min', but rank always increases by 1 between groups
na_option : {'keep', 'top', 'bottom'}
* keep: leave NA values where they are
* top: smallest rank if ascending
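A short illustration of the difference between `'min'` and the new `'dense'` method documented above (run against a current pandas):

```python
import pandas as pd

df = pd.DataFrame({"x": [10, 20, 20, 30]})

# 'min' leaves a gap in the ranks after the tied group; 'dense' does not.
min_ranks = df["x"].rank(method="min").tolist()      # [1.0, 2.0, 2.0, 4.0]
dense_ranks = df["x"].rank(method="dense").tolist()  # [1.0, 2.0, 2.0, 3.0]
```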
51 changes: 25 additions & 26 deletions pandas/core/index.py
@@ -631,34 +631,35 @@ def __hash__(self):
raise TypeError("unhashable type: %r" % type(self).__name__)

def __getitem__(self, key):
"""Override numpy.ndarray's __getitem__ method to work as desired"""
arr_idx = self.view(np.ndarray)
"""
Override numpy.ndarray's __getitem__ method to work as desired.

This function adds lists and Series as valid boolean indexers
(ndarrays only supports ndarray with dtype=bool).

If resulting ndim != 1, plain ndarray is returned instead of
corresponding `Index` subclass.

"""
# There's no custom logic to be implemented in __getslice__, so it's
# not overloaded intentionally.
__getitem__ = super(Index, self).__getitem__
if np.isscalar(key):
return arr_idx[key]
else:
if com._is_bool_indexer(key):
key = np.asarray(key)
return __getitem__(key)

try:
result = arr_idx[key]
if result.ndim > 1:
return result
except (IndexError):
if not len(key):
result = []
else:
raise
if isinstance(key, slice):
# This case is separated from the conditional above to avoid
# pessimization of basic indexing.
return __getitem__(key)

return Index(result, name=self.name)
if com._is_bool_indexer(key):
return __getitem__(np.asarray(key))

def _getitem_slice(self, key):
""" getitem for a bool/sliceable, fallback to standard getitem """
try:
arr_idx = self.view(np.ndarray)
result = arr_idx[key]
return self.__class__(result, name=self.name, fastpath=True)
except:
return self.__getitem__(key)
result = __getitem__(key)
if result.ndim > 1:
return result.view(np.ndarray)
else:
return result

def append(self, other):
"""
@@ -2800,8 +2801,6 @@ def __getitem__(self, key):

return result

_getitem_slice = __getitem__

def take(self, indexer, axis=None):
"""
Analogous to ndarray.take
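The rewritten `__getitem__` preserves the `Index` class for slices, list indexers, and boolean masks; a quick check against a current pandas, where this behavior still holds:

```python
import pandas as pd

i = pd.Index(["a", "b", "c", "d"])

# Slicing keeps the Index class rather than decaying to an ndarray.
assert type(i[1:3]) is type(i)

# Plain Python lists of booleans are accepted as boolean indexers.
masked = i[[True, False, True, False]]
assert type(masked) is type(i)
assert masked.tolist() == ["a", "c"]
```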
2 changes: 1 addition & 1 deletion pandas/core/internals.py
@@ -3737,7 +3737,7 @@ def get_slice(self, slobj, raise_on_error=False):
if raise_on_error:
_check_slice_bounds(slobj, self.index)
return self.__class__(self._block._slice(slobj),
self.index._getitem_slice(slobj), fastpath=True)
self.index[slobj], fastpath=True)

def set_axis(self, axis, value, maybe_rename=True, check_axis=True):
cur_axis, value = self._set_axis(axis, value, check_axis)
3 changes: 2 additions & 1 deletion pandas/core/series.py
@@ -1720,11 +1720,12 @@ def rank(self, method='average', na_option='keep', ascending=True,

Parameters
----------
method : {'average', 'min', 'max', 'first'}
method : {'average', 'min', 'max', 'first', 'dense'}
* average: average rank of group
* min: lowest rank in group
* max: highest rank in group
* first: ranks assigned in order they appear in the array
* dense: like 'min', but rank always increases by 1 between groups
na_option : {'keep'}
keep: leave NA values where they are
ascending : boolean, default True
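The `na_option` values listed in the `rank` docstring behave as follows (a quick example against a current pandas):

```python
import numpy as np
import pandas as pd

s = pd.Series([3.0, np.nan, 1.0, 2.0])

# 'keep' (the default) leaves the NaN unranked.
assert s.rank().tolist()[0] == 3.0

# 'top' gives NaN the smallest rank; 'bottom' gives it the largest.
assert s.rank(na_option="top").tolist() == [4.0, 1.0, 2.0, 3.0]
assert s.rank(na_option="bottom").tolist() == [3.0, 4.0, 1.0, 2.0]
```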