pandas-dev
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
+1-2
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
+10-3
diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
+5-5
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
+4-4
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
+3
diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py
+2-2
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
+1-1
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
+1-1
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
+2-2
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
+1-1
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
+2-2
diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst
+7-67
diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst
+1-1
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
+3-18
@@ -1,5 +1,4 @@
 - [ ] closes #xxxx
 - [ ] tests added / passed
-- [ ] passes `black pandas`
-- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
+- [ ] Ensure all linting tests pass, see [here](https://pandas.pydata.org/pandas-docs/dev/development/contributing.html#code-standards) for how to run them
 - [ ] whatsnew entry
@@ -1,4 +1,5 @@
 minimum_pre_commit_version: 2.9.2
+exclude: ^LICENSES/|\.(html|csv|svg)$
 repos:
 -   repo: https://github.com/python/black
     rev: 20.8b1
@@ -121,6 +122,13 @@ repos:
         entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module"
         types: [python]
         exclude: ^(asv_bench|pandas/tests|doc)/
+    -   id: unwanted-patterns-bare-pytest-raises
+        name: Check for use of bare pytest raises
+        language: python
+        entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises"
+        types: [python]
+        files: ^pandas/tests/
+        exclude: ^pandas/tests/(computation|extension|io)/
     -   id: inconsistent-namespace-usage
         name: 'Check for inconsistent use of pandas namespace in tests'
         entry: python scripts/check_for_inconsistent_pandas_namespace.py
@@ -137,7 +145,7 @@ repos:
         name: Check for use of foo.__class__ instead of type(foo)
         entry: \.__class__
         language: pygrep
-        files: \.(py|pyx)$
+        types_or: [python, cython]
     -   id: unwanted-typing
         name: Check for use of comment-based annotation syntax and missing error codes
         entry: |
@@ -165,9 +173,8 @@ repos:
     rev: v3.4.0
     hooks:
     -   id: end-of-file-fixer
-        exclude: ^LICENSES/|\.(html|csv|txt|svg|py)$
+        exclude: \.txt$
     -   id: trailing-whitespace
-        exclude: \.(html|svg)$
 -   repo: https://github.com/codespell-project/codespell
     rev: v2.0.0
     hooks:
 
@@ -122,18 +122,18 @@ def setup(self, op):
         n_rows = 500
 
         # construct dataframe with 2 blocks
-        arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8")
-        arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4")
+        arr1 = np.random.randn(n_rows, n_cols // 2).astype("f8")
+        arr2 = np.random.randn(n_rows, n_cols // 2).astype("f4")
         df = pd.concat(
             [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True
         )
         # should already be the case, but just to be sure
         df._consolidate_inplace()
 
         # TODO: GH#33198 the setting here shouldn't need two steps
-        arr1 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
-        arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("i8")
-        arr3 = np.random.randn(n_rows, int(n_cols / 4)).astype("f8")
+        arr1 = np.random.randn(n_rows, n_cols // 4).astype("f8")
+        arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8")
+        arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8")
         df2 = pd.concat(
             [pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
             axis=1,
 
@@ -263,7 +263,7 @@ class Repr:
     def setup(self):
         nrows = 10000
         data = np.random.randn(nrows, 10)
-        arrays = np.tile(np.random.randn(3, int(nrows / 100)), 100)
+        arrays = np.tile(np.random.randn(3, nrows // 100), 100)
         idx = MultiIndex.from_arrays(arrays)
         self.df3 = DataFrame(data, index=idx)
         self.df4 = DataFrame(data, index=np.random.randn(nrows))
@@ -648,9 +648,9 @@ class Describe:
     def setup(self):
         self.df = DataFrame(
             {
-                "a": np.random.randint(0, 100, int(1e6)),
-                "b": np.random.randint(0, 100, int(1e6)),
-                "c": np.random.randint(0, 100, int(1e6)),
+                "a": np.random.randint(0, 100, 10 ** 6),
+                "b": np.random.randint(0, 100, 10 ** 6),
+                "c": np.random.randint(0, 100, 10 ** 6),
             }
         )
 
 
@@ -126,6 +126,9 @@ def setup(self, data, key):
     def time_series_groups(self, data, key):
         self.ser.groupby(self.ser).groups
 
+    def time_series_indices(self, data, key):
+        self.ser.groupby(self.ser).indices
+
 
 class GroupManyLabels:
 
 
@@ -103,9 +103,9 @@ class Float64GroupIndex:
     # GH28303
     def setup(self):
         self.df = pd.date_range(
-            start="1/1/2018", end="1/2/2018", periods=1e6
+            start="1/1/2018", end="1/2/2018", periods=10 ** 6
         ).to_frame()
-        self.group_index = np.round(self.df.index.astype(int) / 1e9)
+        self.group_index = np.round(self.df.index.astype(int) / 10 ** 9)
 
     def time_groupby(self):
         self.df.groupby(self.group_index).last()
 
@@ -42,7 +42,7 @@ class ToNumericDowncast:
     ]
 
     N = 500000
-    N2 = int(N / 2)
+    N2 = N // 2
 
     data_dict = {
         "string-int": ["1"] * N2 + [2] * N2,
 
@@ -158,7 +158,7 @@ def setup(self):
         daily_dates = date_index.to_period("D").to_timestamp("S", "S")
         self.fracofday = date_index.values - daily_dates.values
         self.fracofday = self.fracofday.astype("timedelta64[ns]")
-        self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0
+        self.fracofday = self.fracofday.astype(np.float64) / 86_400_000_000_000
         self.fracofday = Series(self.fracofday, daily_dates)
         index = date_range(date_index.min(), date_index.max(), freq="D")
         self.temp = Series(1.0, index)[self.fracofday.index]
 
@@ -171,7 +171,7 @@ class PeakMemFixedWindowMinMax:
     params = ["min", "max"]
 
     def setup(self, operation):
-        N = int(1e6)
+        N = 10 ** 6
         arr = np.random.random(N)
         self.roll = pd.Series(arr).rolling(2)
 
@@ -233,7 +233,7 @@ class GroupbyLargeGroups:
 
     def setup(self):
         N = 100000
-        self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)})
+        self.df = pd.DataFrame({"A": [1, 2] * (N // 2), "B": np.random.randn(N)})
 
     def time_rolling_multiindex_creation(self):
         self.df.groupby("A").rolling(3).mean()
 
@@ -284,7 +284,7 @@ def time_dir_strings(self):
 class SeriesGetattr:
     # https://github.com/pandas-dev/pandas/issues/19764
     def setup(self):
-        self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=int(1e6)))
+        self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=10 ** 6))
 
     def time_series_datetimeindex_repr(self):
         getattr(self.s, "a", None)
 
@@ -346,7 +346,7 @@ def time_iso8601_tz_spaceformat(self):
 class ToDatetimeNONISO8601:
     def setup(self):
         N = 10000
-        half = int(N / 2)
+        half = N // 2
         ts_string_1 = "March 1, 2018 12:00:00+0400"
         ts_string_2 = "March 1, 2018 12:00:00+0500"
         self.same_offset = [ts_string_1] * N
@@ -376,7 +376,7 @@ def setup(self):
         self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N
         self.diff_offset = [
             f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10)
-        ] * int(N / 10)
+        ] * (N // 10)
 
     def time_exact(self):
         to_datetime(self.s2, format="%d%b%y")
 
@@ -8,7 +8,7 @@ For potential users coming from `SAS <https://en.wikipedia.org/wiki/SAS_(softwar
 this page is meant to demonstrate how different SAS operations would be
 performed in pandas.
 
-.. include:: comparison_boilerplate.rst
+.. include:: includes/introduction.rst
 
 .. note::
 
@@ -93,16 +93,7 @@ specifying the column names.
        ;
    run;
 
-A pandas ``DataFrame`` can be constructed in many different ways,
-but for a small number of values, it is often convenient to specify it as
-a Python dictionary, where the keys are the column names
-and the values are the data.
-
-.. ipython:: python
-
-   df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]})
-   df
-
+.. include:: includes/construct_dataframe.rst
 
 Reading external data
 ~~~~~~~~~~~~~~~~~~~~~
@@ -217,12 +208,7 @@ or more columns.
           DATA step begins and can also be used in PROC statements */
    run;
 
-DataFrames can be filtered in multiple ways; the most intuitive of which is using
-:ref:`boolean indexing <indexing.boolean>`
-
-.. ipython:: python
-
-   tips[tips["total_bill"] > 10].head()
+.. include:: includes/filtering.rst
 
 If/then logic
 ~~~~~~~~~~~~~
@@ -239,18 +225,7 @@ In SAS, if/then logic can be used to create new columns.
        else bucket = 'high';
    run;
 
-The same operation in pandas can be accomplished using
-the ``where`` method from ``numpy``.
-
-.. ipython:: python
-
-   tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high")
-   tips.head()
-
-.. ipython:: python
-   :suppress:
-
-   tips = tips.drop("bucket", axis=1)
+.. include:: includes/if_then.rst
 
 Date functionality
 ~~~~~~~~~~~~~~~~~~
@@ -278,28 +253,7 @@ functions pandas supports other Time Series features
 not available in Base SAS (such as resampling and custom offsets) -
 see the :ref:`timeseries documentation<timeseries>` for more details.
 
-.. ipython:: python
-
-   tips["date1"] = pd.Timestamp("2013-01-15")
-   tips["date2"] = pd.Timestamp("2015-02-15")
-   tips["date1_year"] = tips["date1"].dt.year
-   tips["date2_month"] = tips["date2"].dt.month
-   tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin()
-   tips["months_between"] = tips["date2"].dt.to_period("M") - tips[
-       "date1"
-   ].dt.to_period("M")
-
-   tips[
-       ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]
-   ].head()
-
-.. ipython:: python
-   :suppress:
-
-   tips = tips.drop(
-       ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"],
-       axis=1,
-   )
+.. include:: includes/time_date.rst
 
 Selection of columns
 ~~~~~~~~~~~~~~~~~~~~
@@ -349,14 +303,7 @@ Sorting in SAS is accomplished via ``PROC SORT``
        by sex total_bill;
    run;
 
-pandas objects have a :meth:`~DataFrame.sort_values` method, which
-takes a list of columns to sort by.
-
-.. ipython:: python
-
-   tips = tips.sort_values(["sex", "total_bill"])
-   tips.head()
-
+.. include:: includes/sorting.rst
 
 String processing
 -----------------
@@ -377,14 +324,7 @@ functions. ``LENGTHN`` excludes trailing blanks and ``LENGTHC`` includes trailin
    put(LENGTHC(time));
    run;
 
-Python determines the length of a character string with the ``len`` function.
-``len`` includes trailing blanks.  Use ``len`` and ``rstrip`` to exclude
-trailing blanks.
-
-.. ipython:: python
-
-   tips["time"].str.len().head()
-   tips["time"].str.rstrip().str.len().head()
+.. include:: includes/length.rst
 
 
 Find
 
@@ -14,7 +14,7 @@ terminology and link to documentation for Excel, but much will be the same/simil
 `Apple Numbers <https://www.apple.com/mac/numbers/compatibility/functions.html>`_, and other
 Excel-compatible spreadsheet software.
 
-.. include:: comparison_boilerplate.rst
+.. include:: includes/introduction.rst
 
 Data structures
 ---------------
 
@@ -8,7 +8,7 @@ Since many potential pandas users have some familiarity with
 `SQL <https://en.wikipedia.org/wiki/SQL>`_, this page is meant to provide some examples of how
 various SQL operations would be performed using pandas.
 
-.. include:: comparison_boilerplate.rst
+.. include:: includes/introduction.rst
 
 Most of the examples will utilize the ``tips`` dataset found within pandas tests.  We'll read
 the data into a DataFrame called ``tips`` and assume we have a database table of the same name and
@@ -65,24 +65,9 @@ Filtering in SQL is done via a WHERE clause.
 
     SELECT *
     FROM tips
-    WHERE time = 'Dinner'
-    LIMIT 5;
-
-DataFrames can be filtered in multiple ways; the most intuitive of which is using
-:ref:`boolean indexing <indexing.boolean>`
-
-.. ipython:: python
-
-    tips[tips["time"] == "Dinner"].head(5)
-
-The above statement is simply passing a ``Series`` of True/False objects to the DataFrame,
-returning all rows with True.
-
-.. ipython:: python
+    WHERE time = 'Dinner';
 
-    is_dinner = tips["time"] == "Dinner"
-    is_dinner.value_counts()
-    tips[is_dinner].head(5)
+.. include:: includes/filtering.rst
 
 Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame using | (OR) and &
 (AND).
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -42,7 +42,7 @@ class ToNumericDowncast:` |
| `42` | `42` | `]` |
| `43` | `43` | |
| `44` | `44` | `N = 500000` |
| `45` | | `- N2 = int(N / 2)` |
| | `45` | `+ N2 = N // 2` |
| `46` | `46` | |
| `47` | `47` | `data_dict = {` |
| `48` | `48` | `"string-int": ["1"] * N2 + [2] * N2,` |