Merge remote-tracking branch 'upstream/master' into str_cat_err

h-vetinari · h-vetinari · commit 48ff187ff072 · 2019-06-01T21:44:09.000+02:00
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
@@ -94,6 +94,12 @@ def time_min(self):
     def time_min_trivial(self):
         self.idx_inc.min()
 
+    def time_get_loc_inc(self):
+        self.idx_inc.get_loc(900000)
+
+    def time_get_loc_dec(self):
+        self.idx_dec.get_loc(100000)
+
 
 class IndexAppend:
 
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -116,63 +116,3 @@ jobs:
       fi
     displayName: 'Running benchmarks'
     condition: true
-
-- job: 'Docs'
-  pool:
-    vmImage: ubuntu-16.04
-  timeoutInMinutes: 90
-  steps:
-  - script: |
-      echo '##vso[task.setvariable variable=CONDA_ENV]pandas-dev'
-      echo '##vso[task.setvariable variable=ENV_FILE]environment.yml'
-    displayName: 'Setting environment variables'
-
-  - script: |
-      export PATH=$HOME/miniconda3/bin:$PATH
-      sudo apt-get install -y libc6-dev-i386
-      ci/setup_env.sh
-    displayName: 'Setup environment and build pandas'
-
-  - script: |
-      export PATH=$HOME/miniconda3/bin:$PATH
-      source activate pandas-dev
-      doc/make.py
-    displayName: 'Build documentation'
-
-  - script: |
-      cd doc/build/html
-      git init
-      touch .nojekyll
-      git add --all .
-      git config user.email "pandas-dev@python.org"
-      git config user.name "pandas-docs-bot"
-      git commit -m "pandas documentation in master"
-    displayName: 'Create git repo for docs build'
-    condition : |
-      and(not(eq(variables['Build.Reason'], 'PullRequest')),
-          eq(variables['Build.SourceBranch'], 'refs/heads/master'))
-
-  # This task to work requires next steps:
-  # 1. Got to "Library > Secure files" in the azure-pipelines dashboard: https://dev.azure.com/pandas-dev/pandas/_library?itemType=SecureFiles
-  # 2. Click on "+ Secure file"
-  # 3. Upload the private key (the name of the file must match with the specified in "sshKeySecureFile" input below, "pandas_docs_key")
-  # 4. Click on file name after it is created, tick the box "Authorize for use in all pipelines" and save
-  # 5. The public key specified in "sshPublicKey" is the pair of the uploaded private key, and needs to be specified as a deploy key of the repo where the docs will be pushed: https://github.com/pandas-dev/pandas-dev.github.io/settings/keys
-  - task: InstallSSHKey@0
-    inputs:
-      hostName: 'github.com'
-      sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDHmz3l/EdqrgNxEUKkwDUuUcLv91unig03pYFGO/DMIgCmPdMG96zAgfnESd837Rm0wSSqylwSzkRJt5MV/TpFlcVifDLDQmUhqCeO8Z6dLl/oe35UKmyYICVwcvQTAaHNnYRpKC5IUlTh0JEtw9fGlnp1Ta7U1ENBLbKdpywczElhZu+hOQ892zqOj3CwA+U2329/d6cd7YnqIKoFN9DWT3kS5K6JE4IoBfQEVekIOs23bKjNLvPoOmi6CroAhu/K8j+NCWQjge5eJf2x/yTnIIP1PlEcXoHIr8io517posIx3TBup+CN8bNS1PpDW3jyD3ttl1uoBudjOQrobNnJeR6Rn67DRkG6IhSwr3BWj8alwUG5mTdZzwV5Pa9KZFdIiqX7NoDGg+itsR39QCn0thK8lGRNSR8KrWC1PSjecwelKBO7uQ7rnk/rkrZdBWR4oEA8YgNH8tirUw5WfOr5a0AIaJicKxGKNdMxZt+zmC+bS7F4YCOGIm9KHa43RrKhoGRhRf9fHHHKUPwFGqtWG4ykcUgoamDOURJyepesBAO3FiRE9rLU6ILbB3yEqqoekborHmAJD5vf7PWItW3Q/YQKuk3kkqRcKnexPyzyyq5lUgTi8CxxZdaASIOu294wjBhhdyHlXEkVTNJ9JKkj/obF+XiIIp0cBDsOXY9hDQ== pandas-dev@python.org'
-      sshKeySecureFile: 'pandas_docs_key'
-    displayName: 'Install GitHub ssh deployment key'
-    condition : |
-      and(not(eq(variables['Build.Reason'], 'PullRequest')),
-          eq(variables['Build.SourceBranch'], 'refs/heads/master'))
-
-  - script: |
-      cd doc/build/html
-      git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git
-      git push origin master -f
-    displayName: 'Publish docs to GitHub pages'
-    condition : |
-      and(not(eq(variables['Build.Reason'], 'PullRequest')),
-          eq(variables['Build.SourceBranch'], 'refs/heads/master'))
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -493,6 +493,7 @@ Performance Improvements
 - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
   int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
 - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
+- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`)
 - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
 - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
 - Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
@@ -22,6 +22,8 @@
 from pandas.core.indexes.base import Index, _index_shared_docs
 from pandas.core.indexes.numeric import Int64Index
 
+from pandas.io.formats.printing import pprint_thing
+
 
 class RangeIndex(Int64Index):
     """
@@ -64,6 +66,8 @@ class RangeIndex(Int64Index):
     _typ = 'rangeindex'
     _engine_type = libindex.Int64Engine
 
+    # check whether self._data has been called
+    _cached_data = None  # type: np.ndarray
     # --------------------------------------------------------------------
     # Constructors
 
@@ -164,6 +168,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None,
         for k, v in kwargs.items():
             setattr(result, k, v)
 
+        result._range = range(result._start, result._stop, result._step)
+
         result._reset_identity()
         return result
 
@@ -180,9 +186,19 @@ def _constructor(self):
         """ return the class to use for construction """
         return Int64Index
 
-    @cache_readonly
+    @property
     def _data(self):
-        return np.arange(self._start, self._stop, self._step, dtype=np.int64)
+        """
+        An int array that for performance reasons is created only when needed.
+
+        The constructed array is saved in ``_cached_data``. This allows us to
+        check if the array has been created without accessing ``_data`` and
+        triggering the construction.
+        """
+        if self._cached_data is None:
+            self._cached_data = np.arange(self._start, self._stop, self._step,
+                                          dtype=np.int64)
+        return self._cached_data
 
     @cache_readonly
     def _int64index(self):
@@ -215,6 +231,9 @@ def _format_data(self, name=None):
         # we are formatting thru the attributes
         return None
 
+    def _format_with_header(self, header, na_rep='NaN', **kwargs):
+        return header + list(map(pprint_thing, self._range))
+
     # --------------------------------------------------------------------
     @property
     def start(self):
@@ -296,6 +315,15 @@ def is_monotonic_decreasing(self):
     def has_duplicates(self):
         return False
 
+    @Appender(_index_shared_docs['get_loc'])
+    def get_loc(self, key, method=None, tolerance=None):
+        if is_integer(key) and method is None and tolerance is None:
+            try:
+                return self._range.index(key)
+            except ValueError:
+                raise KeyError(key)
+        return super().get_loc(key, method=method, tolerance=tolerance)
+
     def tolist(self):
         return list(range(self._start, self._stop, self._step))
 
diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py
@@ -241,6 +241,42 @@ def test_view(self):
     def test_dtype(self):
         assert self.index.dtype == np.int64
 
+    def test_cached_data(self):
+        # GH 26565
+        # Calling RangeIndex._data caches an int64 array of the same length at
+        # self._cached_data. This tests whether _cached_data has been set.
+        idx = RangeIndex(0, 100, 10)
+
+        assert idx._cached_data is None
+
+        repr(idx)
+        assert idx._cached_data is None
+
+        str(idx)
+        assert idx._cached_data is None
+
+        idx.get_loc(20)
+        assert idx._cached_data is None
+
+        df = pd.DataFrame({'a': range(10)}, index=idx)
+
+        df.loc[50]
+        assert idx._cached_data is None
+
+        with pytest.raises(KeyError):
+            df.loc[51]
+        assert idx._cached_data is None
+
+        df.loc[10:50]
+        assert idx._cached_data is None
+
+        df.iloc[5:10]
+        assert idx._cached_data is None
+
+        # actually calling idx._data
+        assert isinstance(idx._data, np.ndarray)
+        assert isinstance(idx._cached_data, np.ndarray)
+
     def test_is_monotonic(self):
         assert self.index.is_monotonic is True
         assert self.index.is_monotonic_increasing is True