pandas-dev
diff --git a/‎Makefile
+1 b/‎Makefile
+1
diff --git a/‎asv_bench/benchmarks/categoricals.py
+17 b/‎asv_bench/benchmarks/categoricals.py
+17
diff --git a/‎asv_bench/benchmarks/frame_methods.py
+18 b/‎asv_bench/benchmarks/frame_methods.py
+18
diff --git a/‎asv_bench/benchmarks/groupby.py
+20-1 b/‎asv_bench/benchmarks/groupby.py
+20-1
diff --git a/‎ci/environment-dev.yaml
+1 b/‎ci/environment-dev.yaml
+1
diff --git a/‎ci/requirements_dev.txt
+1 b/‎ci/requirements_dev.txt
+1
diff --git a/‎doc/make.py
+15-2 b/‎doc/make.py
+15-2
diff --git a/‎doc/source/_static/favicon.ico
3.81 KB b/‎doc/source/_static/favicon.ico
3.81 KB
diff --git a/‎doc/source/advanced.rst
+2-2 b/‎doc/source/advanced.rst
+2-2
diff --git a/‎doc/source/api.rst
+3-3 b/‎doc/source/api.rst
+3-3
diff --git a/‎doc/source/basics.rst
+29-30 b/‎doc/source/basics.rst
+29-30
diff --git a/‎doc/source/categorical.rst
+5-5 b/‎doc/source/categorical.rst
+5-5
diff --git a/‎doc/source/comparison_with_r.rst
+5-5 b/‎doc/source/comparison_with_r.rst
+5-5
@@ -23,3 +23,4 @@ doc:
 	cd doc; \
 	python make.py clean; \
 	python make.py html
+	python make.py spellcheck
@@ -193,3 +193,20 @@ def time_categorical_series_is_monotonic_increasing(self):
 
     def time_categorical_series_is_monotonic_decreasing(self):
         self.s.is_monotonic_decreasing
+
+
+class Contains(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 10**5
+        self.ci = tm.makeCategoricalIndex(N)
+        self.c = self.ci.values
+        self.key = self.ci.categories[0]
+
+    def time_categorical_index_contains(self):
+        self.key in self.ci
+
+    def time_categorical_contains(self):
+        self.key in self.c
@@ -512,3 +512,21 @@ def time_nlargest(self, keep):
 
     def time_nsmallest(self, keep):
         self.df.nsmallest(100, 'A', keep=keep)
+
+
+class Describe(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame({
+            'a': np.random.randint(0, 100, int(1e6)),
+            'b': np.random.randint(0, 100, int(1e6)),
+            'c': np.random.randint(0, 100, int(1e6))
+        })
+
+    def time_series_describe(self):
+        self.df['a'].describe()
+
+    def time_dataframe_describe(self):
+        self.df.describe()
@@ -5,7 +5,7 @@
 
 import numpy as np
 from pandas import (DataFrame, Series, MultiIndex, date_range, period_range,
-                    TimeGrouper, Categorical)
+                    TimeGrouper, Categorical, Timestamp)
 import pandas.util.testing as tm
 
 from .pandas_vb_common import setup  # noqa
@@ -385,6 +385,25 @@ def time_dtype_as_field(self, dtype, method, application):
         self.as_field_method()
 
 
+class RankWithTies(object):
+    # GH 21237
+    goal_time = 0.2
+    param_names = ['dtype', 'tie_method']
+    params = [['float64', 'float32', 'int64', 'datetime64'],
+              ['first', 'average', 'dense', 'min', 'max']]
+
+    def setup(self, dtype, tie_method):
+        N = 10**4
+        if dtype == 'datetime64':
+            data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
+        else:
+            data = np.array([1] * N, dtype=dtype)
+        self.df = DataFrame({'values': data, 'key': ['foo'] * N})
+
+    def time_rank_ties(self, dtype, tie_method):
+        self.df.groupby('key').rank(method=tie_method)
+
+
 class Float32(object):
     # GH 13335
     goal_time = 0.2
 
@@ -13,3 +13,4 @@ dependencies:
   - pytz
   - setuptools>=24.2.0
   - sphinx
+  - sphinxcontrib-spelling
@@ -9,3 +9,4 @@ python-dateutil>=2.5.0
 pytz
 setuptools>=24.2.0
 sphinx
+sphinxcontrib-spelling
@@ -224,8 +224,9 @@ def _sphinx_build(self, kind):
         --------
         >>> DocBuilder(num_jobs=4)._sphinx_build('html')
         """
-        if kind not in ('html', 'latex'):
-            raise ValueError('kind must be html or latex, not {}'.format(kind))
+        if kind not in ('html', 'latex', 'spelling'):
+            raise ValueError('kind must be html, latex or '
+                             'spelling, not {}'.format(kind))
 
         self._run_os('sphinx-build',
                      '-j{}'.format(self.num_jobs),
@@ -304,6 +305,18 @@ def zip_html(self):
                      '-q',
                      *fnames)
 
+    def spellcheck(self):
+        """Spell check the documentation."""
+        self._sphinx_build('spelling')
+        output_location = os.path.join('build', 'spelling', 'output.txt')
+        with open(output_location) as output:
+            lines = output.readlines()
+            if lines:
+                raise SyntaxError(
+                    'Found misspelled words.'
+                    ' Check pandas/doc/build/spelling/output.txt'
+                    ' for more details.')
+
 
 def main():
     cmds = [method for method in dir(DocBuilder) if not method.startswith('_')]
 
@@ -342,7 +342,7 @@ As usual, **both sides** of the slicers are included as this is label indexing.
                        columns=micolumns).sort_index().sort_index(axis=1)
    dfmi
 
-Basic multi-index slicing using slices, lists, and labels.
+Basic MultiIndex slicing using slices, lists, and labels.
 
 .. ipython:: python
 
@@ -1039,7 +1039,7 @@ On the other hand, if the index is not monotonic, then both slice bounds must be
     KeyError: 'Cannot get right slice bound for non-unique label: 3'
 
 :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` only check that
-an index is weakly monotonic. To check for strict montonicity, you can combine one of those with
+an index is weakly monotonic. To check for strict monotonicity, you can combine one of those with
 :meth:`Index.is_unique`
 
 .. ipython:: python
 
@@ -1200,9 +1200,9 @@ Attributes and underlying data
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 **Axes**
 
-  * **items**: axis 0; each item corresponds to a DataFrame contained inside
-  * **major_axis**: axis 1; the index (rows) of each of the DataFrames
-  * **minor_axis**: axis 2; the columns of each of the DataFrames
+* **items**: axis 0; each item corresponds to a DataFrame contained inside
+* **major_axis**: axis 1; the index (rows) of each of the DataFrames
+* **minor_axis**: axis 2; the columns of each of the DataFrames
 
 .. autosummary::
    :toctree: generated/
 
@@ -50,9 +50,8 @@ Attributes and the raw ndarray(s)
 
 pandas objects have a number of attributes enabling you to access the metadata
 
-  * **shape**: gives the axis dimensions of the object, consistent with ndarray
-  * Axis labels
-
+* **shape**: gives the axis dimensions of the object, consistent with ndarray
+* Axis labels
     * **Series**: *index* (only axis)
     * **DataFrame**: *index* (rows) and *columns*
     * **Panel**: *items*, *major_axis*, and *minor_axis*
@@ -131,9 +130,9 @@ Flexible binary operations
 With binary operations between pandas data structures, there are two key points
 of interest:
 
-  * Broadcasting behavior between higher- (e.g. DataFrame) and
-    lower-dimensional (e.g. Series) objects.
-  * Missing data in computations.
+* Broadcasting behavior between higher- (e.g. DataFrame) and
+  lower-dimensional (e.g. Series) objects.
+* Missing data in computations.
 
 We will demonstrate how to manage these issues independently, though they can
 be handled simultaneously.
@@ -168,7 +167,7 @@ either match on the *index* or *columns* via the **axis** keyword:
 
    df_orig = df
 
-Furthermore you can align a level of a multi-indexed DataFrame with a Series.
+Furthermore you can align a level of a MultiIndexed DataFrame with a Series.
 
 .. ipython:: python
 
@@ -462,10 +461,10 @@ produce an object of the same size. Generally speaking, these methods take an
 **axis** argument, just like *ndarray.{sum, std, ...}*, but the axis can be
 specified by name or integer:
 
-  - **Series**: no axis argument needed
-  - **DataFrame**: "index" (axis=0, default), "columns" (axis=1)
-  - **Panel**: "items" (axis=0), "major" (axis=1, default), "minor"
-    (axis=2)
+* **Series**: no axis argument needed
+* **DataFrame**: "index" (axis=0, default), "columns" (axis=1)
+* **Panel**: "items" (axis=0), "major" (axis=1, default), "minor"
+  (axis=2)
 
 For example:
 
@@ -593,7 +592,7 @@ categorical columns:
     frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})
     frame.describe()
 
-This behaviour can be controlled by providing a list of types as ``include``/``exclude``
+This behavior can be controlled by providing a list of types as ``include``/``exclude``
 arguments. The special value ``all`` can also be used:
 
 .. ipython:: python
@@ -1034,7 +1033,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin
 Transform with multiple functions
 +++++++++++++++++++++++++++++++++
 
-Passing multiple functions will yield a column multi-indexed DataFrame.
+Passing multiple functions will yield a column MultiIndexed DataFrame.
 The first level will be the original frame column names; the second level
 will be the names of the transforming functions.
 
@@ -1060,7 +1059,7 @@ Passing a dict of functions will allow selective transforming per column.
 
    tsdf.transform({'A': np.abs, 'B': lambda x: x+1})
 
-Passing a dict of lists will generate a multi-indexed DataFrame with these
+Passing a dict of lists will generate a MultiIndexed DataFrame with these
 selective transforms.
 
 .. ipython:: python
@@ -1187,11 +1186,11 @@ It is used to implement nearly all other features relying on label-alignment
 functionality. To *reindex* means to conform the data to match a given set of
 labels along a particular axis. This accomplishes several things:
 
-  * Reorders the existing data to match a new set of labels
-  * Inserts missing value (NA) markers in label locations where no data for
-    that label existed
-  * If specified, **fill** data for missing labels using logic (highly relevant
-    to working with time series data)
+* Reorders the existing data to match a new set of labels
+* Inserts missing value (NA) markers in label locations where no data for
+  that label existed
+* If specified, **fill** data for missing labels using logic (highly relevant
+  to working with time series data)
 
 Here is a simple example:
 
@@ -1889,12 +1888,12 @@ faster than sorting the entire Series and calling ``head(n)`` on the result.
    df.nsmallest(5, ['a', 'c'])
 
 
-.. _basics.multi-index_sorting:
+.. _basics.multiindex_sorting:
 
-Sorting by a multi-index column
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Sorting by a MultiIndex column
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-You must be explicit about sorting when the column is a multi-index, and fully specify
+You must be explicit about sorting when the column is a MultiIndex, and fully specify
 all levels to ``by``.
 
 .. ipython:: python
@@ -1911,10 +1910,10 @@ the axis indexes, since they are immutable) and returns a new object. Note that
 **it is seldom necessary to copy objects**. For example, there are only a
 handful of ways to alter a DataFrame *in-place*:
 
-  * Inserting, deleting, or modifying a column.
-  * Assigning to the ``index`` or ``columns`` attributes.
-  * For homogeneous data, directly modifying the values via the ``values``
-    attribute or advanced indexing.
+* Inserting, deleting, or modifying a column.
+* Assigning to the ``index`` or ``columns`` attributes.
+* For homogeneous data, directly modifying the values via the ``values``
+  attribute or advanced indexing.
 
 To be clear, no pandas method has the side effect of modifying your data;
 almost every method returns a new object, leaving the original object
@@ -2112,22 +2111,22 @@ Because the data was transposed the original inference stored all columns as obj
 The following functions are available for one dimensional object arrays or scalars to perform
 hard conversion of objects to a specified type:
 
-- :meth:`~pandas.to_numeric` (conversion to numeric dtypes)
+* :meth:`~pandas.to_numeric` (conversion to numeric dtypes)
 
   .. ipython:: python
 
      m = ['1.1', 2, 3]
      pd.to_numeric(m)
 
-- :meth:`~pandas.to_datetime` (conversion to datetime objects)
+* :meth:`~pandas.to_datetime` (conversion to datetime objects)
 
   .. ipython:: python
 
      import datetime
      m = ['2016-07-09', datetime.datetime(2016, 3, 2)]
      pd.to_datetime(m)
 
-- :meth:`~pandas.to_timedelta` (conversion to timedelta objects)
+* :meth:`~pandas.to_timedelta` (conversion to timedelta objects)
 
   .. ipython:: python
 
 
@@ -542,11 +542,11 @@ Comparisons
 
 Comparing categorical data with other objects is possible in three cases:
 
- * Comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array,
-   ...) of the same length as the categorical data.
- * All comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to
-   another categorical Series, when ``ordered==True`` and the `categories` are the same.
- * All comparisons of a categorical data to a scalar.
+* Comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array,
+  ...) of the same length as the categorical data.
+* All comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to
+  another categorical Series, when ``ordered==True`` and the `categories` are the same.
+* All comparisons of a categorical data to a scalar.
 
 All other comparisons, especially "non-equality" comparisons of two categoricals with different
 categories or a categorical with any list-like object, will raise a ``TypeError``.
 
@@ -18,11 +18,11 @@ was started to provide a more detailed look at the `R language
 party libraries as they relate to ``pandas``. In comparisons with R and CRAN
 libraries, we care about the following things:
 
-  - **Functionality / flexibility**: what can/cannot be done with each tool
-  - **Performance**: how fast are operations. Hard numbers/benchmarks are
-    preferable
-  - **Ease-of-use**: Is one tool easier/harder to use (you may have to be
-    the judge of this, given side-by-side code comparisons)
+* **Functionality / flexibility**: what can/cannot be done with each tool
+* **Performance**: how fast are operations. Hard numbers/benchmarks are
+  preferable
+* **Ease-of-use**: Is one tool easier/harder to use (you may have to be
+  the judge of this, given side-by-side code comparisons)
 
 This page is also here to offer a bit of a translation guide for users of these
 R packages.