
Commit b8740a1

itholic authored and viirya committed
[SPARK-35499][PYTHON] Apply black to pandas API on Spark codes
### What changes were proposed in this pull request?

This PR proposes applying `black` to the pandas API on Spark codes, to improve static analysis. By executing `./dev/reformat-python` in the Spark home directory, all of the pandas API on Spark code is reformatted to satisfy the static analysis rules.

### Why are the changes needed?

This reduces the cost of static analysis during development. It has been used continuously for about a year in the Koalas project, and its convenience has been proven.

### Does this PR introduce _any_ user-facing change?

No, it's dev-only.

### How was this patch tested?

Manually reformatted the pandas API on Spark codes by running `./dev/reformat-python`, and checked that `./dev/lint-python` passes.

Closes #32779 from itholic/SPARK-35499.

Authored-by: itholic <[email protected]>
Signed-off-by: Liang-Chi Hsieh <[email protected]>
Parent: d4e32c8


54 files changed: 428 additions, 228 deletions.
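
For context on the diffs below: `black` is an opinionated Python code formatter, and every hunk in this commit is a mechanical rewrite it produced at `--line-length 100`. A small before/after sketch of the kinds of edits involved (illustrative only, not taken from the commit):

```python
# Before formatting: single quotes, a padded one-line docstring, and a
# style that black will normalize.
class Example(object):
    """ pandas-on-Spark style example. """

    def pretty_name(self) -> str:
        return 'example'


# After `black --line-length 100`: double quotes and the docstring
# padding stripped; short wrapped constructs are also joined onto one line.
class ExampleFormatted(object):
    """pandas-on-Spark style example."""

    def pretty_name(self) -> str:
        return "example"
```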

.github/workflows/build_and_test.yml (+1, -1)

```diff
@@ -366,7 +366,7 @@ jobs:
         # See also https://github.com/sphinx-doc/sphinx/issues/7551.
         # Jinja2 3.0.0+ causes error when building with Sphinx.
         # See also https://issues.apache.org/jira/browse/SPARK-35375.
-        python3.6 -m pip install flake8 pydata_sphinx_theme mypy numpydoc 'jinja2<3.0.0'
+        python3.6 -m pip install flake8 pydata_sphinx_theme mypy numpydoc 'jinja2<3.0.0' 'black==21.5b2'
     - name: Install R linter dependencies and SparkR
       run: |
         apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev
```

dev/lint-python (+32)

```diff
@@ -25,6 +25,8 @@ MINIMUM_PYCODESTYLE="2.7.0"
 
 PYTHON_EXECUTABLE="python3"
 
+BLACK_BUILD="$PYTHON_EXECUTABLE -m black"
+
 function satisfies_min_version {
     local provided_version="$1"
     local expected_version="$2"
@@ -185,6 +187,35 @@ flake8 checks failed."
     fi
 }
 
+function black_test {
+    local BLACK_REPORT=
+    local BLACK_STATUS=
+
+    # Skip check if black is not installed.
+    $BLACK_BUILD 2> /dev/null
+    if [ $? -ne 0 ]; then
+        echo "The $BLACK_BUILD command was not found. Skipping black checks for now."
+        echo
+        return
+    fi
+
+    echo "starting black test..."
+    # Black is only applied for pandas API on Spark for now.
+    BLACK_REPORT=$( ($BLACK_BUILD python/pyspark/pandas --line-length 100 --check ) 2>&1)
+    BLACK_STATUS=$?
+
+    if [ "$BLACK_STATUS" -ne 0 ]; then
+        echo "black checks failed:"
+        echo "$BLACK_REPORT"
+        echo "Please run 'dev/reformat-python' script."
+        echo "$BLACK_STATUS"
+        exit "$BLACK_STATUS"
+    else
+        echo "black checks passed."
+        echo
+    fi
+}
+
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "${SCRIPT_DIR}")"
 
@@ -194,6 +225,7 @@ pushd "$SPARK_ROOT_DIR" &> /dev/null
 PYTHON_SOURCE="$(find . -path ./docs/.local_ruby_bundle -prune -false -o -name "*.py")"
 
 compile_python_test "$PYTHON_SOURCE"
+black_test
 pycodestyle_test "$PYTHON_SOURCE"
 flake8_test
 mypy_test
```
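
For readers who prefer Python to shell, here is a minimal sketch of what the new `black_test` function does, using `subprocess` instead of shell plumbing. The target path and line length mirror the diff above; the wrapper function itself is illustrative, not part of the commit:

```python
import subprocess
import sys


def black_check(target="python/pyspark/pandas", line_length=100):
    """Mirror dev/lint-python's black_test: check formatting without rewriting files."""
    try:
        import black  # noqa: F401  # skip the check entirely if black is missing
    except ImportError:
        print("The black command was not found. Skipping black checks for now.")
        return 0
    # --check makes black exit non-zero when any file would be reformatted.
    result = subprocess.run(
        [sys.executable, "-m", "black", target, "--line-length", str(line_length), "--check"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print("black checks failed:")
        print(result.stdout + result.stderr)
        print("Please run 'dev/reformat-python' script.")
    else:
        print("black checks passed.")
    return result.returncode


if __name__ == "__main__":
    sys.exit(black_check())
```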

dev/reformat-python (+32, new file)

```diff
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The current directory of the script.
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+FWDIR="$( cd "$DIR"/.. && pwd )"
+cd "$FWDIR"
+
+BLACK_BUILD="python -m black"
+BLACK_VERSION="21.5b2"
+$BLACK_BUILD 2> /dev/null
+if [ $? -ne 0 ]; then
+  echo "The '$BLACK_BUILD' command was not found. Please install Black, for example, via 'pip install black==$BLACK_VERSION'."
+  exit 1
+fi
+
+# This script is only applied for pandas API on Spark for now.
+$BLACK_BUILD python/pyspark/pandas --line-length 100
```
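
Run from the repository root, this script rewrites files in place. The same operation can be sketched through black's Python API — `black.format_str` and `black.Mode` exist in `black==21.5b2`, though the command line is the supported entry point, so treat this as an illustration:

```python
import black

source = 'def f( x ):\n    return { "a":x }\n'
# Mode(line_length=100) matches the --line-length 100 flag used by the script.
formatted = black.format_str(source, mode=black.Mode(line_length=100))
print(formatted)
# def f(x):
#     return {"a": x}
```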

dev/requirements.txt (+3)

```diff
@@ -32,3 +32,6 @@ sphinx-plotly-directive
 # Development scripts
 jira
 PyGithub
+
+# pandas API on Spark Code formatter.
+black
```

dev/tox.ini (+3, -1)

```diff
@@ -14,11 +14,13 @@
 # limitations under the License.
 
 [pycodestyle]
-ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504
+ignore=E203,E226,E241,E305,E402,E722,E731,E741,W503,W504
 max-line-length=100
 exclude=*/target/*,python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*
 
 [flake8]
 select = E901,E999,F821,F822,F823,F401,F405,B006
+# Ignore F821 for plot documents in pandas API on Spark.
+ignore = F821
 exclude = python/docs/build/html/*,*/target/*,python/pyspark/cloudpickle/*.py,shared.py*,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*,python/out,python/pyspark/sql/pandas/functions.pyi,python/pyspark/sql/column.pyi,python/pyspark/worker.pyi,python/pyspark/java_gateway.pyi
 max-line-length = 100
```
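
E203 ("whitespace before punctuation") is newly ignored because it is the one pycodestyle rule black knowingly violates: black treats the colon in a slice as a binary operator and pads it with spaces when a bound is a complex expression. A hypothetical example of black-formatted code that E203 would otherwise flag:

```python
items = list(range(10))
offset = 2
# black formats complex slice bounds with spaces around the colon,
# which pycodestyle reports as E203 unless the rule is ignored.
middle = items[offset + 1 : -1]
print(middle)  # [3, 4, 5, 6, 7, 8]
```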

python/pyspark/pandas/accessors.py (+2, -2)

```diff
@@ -48,7 +48,7 @@
 
 
 class PandasOnSparkFrameMethods(object):
-    """ pandas-on-Spark specific features for DataFrame. """
+    """pandas-on-Spark specific features for DataFrame."""
 
     def __init__(self, frame: "DataFrame"):
         self._psdf = frame
@@ -696,7 +696,7 @@ def pandas_frame_func(f, field_name):
 
 
 class PandasOnSparkSeriesMethods(object):
-    """ pandas-on-Spark specific features for Series. """
+    """pandas-on-Spark specific features for Series."""
 
     def __init__(self, series: "Series"):
         self._psser = series
```
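
Both hunks in this file are black's docstring normalization: the padding spaces just inside the triple quotes of a one-line docstring are stripped, while the text itself is untouched. A standalone illustration with hypothetical class names:

```python
class BeforeBlack(object):
    """ pandas-on-Spark specific features, with padding spaces. """


class AfterBlack(object):
    """pandas-on-Spark specific features, padding stripped by black."""
```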

python/pyspark/pandas/base.py (+1, -3)

```diff
@@ -1068,9 +1068,7 @@ def notnull(self) -> Union["Series", "Index"]:
 
         if isinstance(self, MultiIndex):
             raise NotImplementedError("notna is not defined for MultiIndex")
-        return (~self.isnull()).rename(
-            self.name  # type: ignore
-        )
+        return (~self.isnull()).rename(self.name)  # type: ignore
 
     notna = notnull
```

python/pyspark/pandas/config.py (+1, -1)

```diff
@@ -381,7 +381,7 @@ def _check_option(key: str) -> None:
 
 
 class DictWrapper:
-    """ provide attribute-style access to a nested dict"""
+    """provide attribute-style access to a nested dict"""
 
     def __init__(self, d: Dict[str, Option], prefix: str = ""):
         object.__setattr__(self, "d", d)
```

python/pyspark/pandas/data_type_ops/base.py (+3, -6)

```diff
@@ -45,11 +45,7 @@
     from pyspark.pandas.series import Series  # noqa: F401 (SPARK-34943)
 
 
-def is_valid_operand_for_numeric_arithmetic(
-    operand: Any,
-    *,
-    allow_bool: bool = True
-) -> bool:
+def is_valid_operand_for_numeric_arithmetic(operand: Any, *, allow_bool: bool = True) -> bool:
     """Check whether the operand is valid for arithmetic operations against numerics."""
     if isinstance(operand, numbers.Number) and not isinstance(operand, bool):
         return True
@@ -58,7 +54,8 @@ def is_valid_operand_for_numeric_arithmetic(
             return False
         else:
             return isinstance(operand.spark.data_type, NumericType) or (
-                allow_bool and isinstance(operand.spark.data_type, BooleanType))
+                allow_bool and isinstance(operand.spark.data_type, BooleanType)
+            )
     else:
         return False
```
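
These two hunks show black's complementary wrapping rules: a construct that fits within the 100-character limit is collapsed onto a single line (the signature), while one that does not fit keeps the wrap with the closing bracket dedented onto its own line (the boolean expression). A hypothetical sketch of both outcomes:

```python
# Collapsed: this signature fits within 100 characters, so black joins the
# previously wrapped parameter list onto one line.
def is_valid_operand(operand: object, *, allow_bool: bool = True) -> bool:
    # Exploded: a wrapped expression that cannot be inlined keeps its wrap,
    # but black dedents the closing parenthesis onto its own line.
    return isinstance(operand, (int, float)) and not isinstance(operand, bool) or (
        allow_bool and isinstance(operand, bool)
    )
```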

python/pyspark/pandas/data_type_ops/binary_ops.py (+5, -3)

```diff
@@ -34,7 +34,7 @@ class BinaryOps(DataTypeOps):
 
     @property
     def pretty_name(self) -> str:
-        return 'binaries'
+        return "binaries"
 
     def add(self, left, right) -> Union["Series", "Index"]:
         if isinstance(right, IndexOpsMixin) and isinstance(right.spark.data_type, BinaryType):
@@ -43,11 +43,13 @@ def add(self, left, right) -> Union["Series", "Index"]:
             return column_op(F.concat)(left, F.lit(right))
         else:
             raise TypeError(
-                "Concatenation can not be applied to %s and the given type." % self.pretty_name)
+                "Concatenation can not be applied to %s and the given type." % self.pretty_name
+            )
 
     def radd(self, left, right) -> Union["Series", "Index"]:
         if isinstance(right, bytes):
             return left._with_new_scol(F.concat(F.lit(right), left.spark.column))
         else:
             raise TypeError(
-                "Concatenation can not be applied to %s and the given type." % self.pretty_name)
+                "Concatenation can not be applied to %s and the given type." % self.pretty_name
+            )
```

python/pyspark/pandas/data_type_ops/boolean_ops.py (+29, -15)

```diff
@@ -38,12 +38,13 @@ class BooleanOps(DataTypeOps):
 
     @property
     def pretty_name(self) -> str:
-        return 'booleans'
+        return "booleans"
 
     def add(self, left, right) -> Union["Series", "Index"]:
         if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
             raise TypeError(
-                "Addition can not be applied to %s and the given type." % self.pretty_name)
+                "Addition can not be applied to %s and the given type." % self.pretty_name
+            )
 
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
@@ -56,7 +57,8 @@ def add(self, left, right) -> Union["Series", "Index"]:
     def sub(self, left, right) -> Union["Series", "Index"]:
         if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
             raise TypeError(
-                "Subtraction can not be applied to %s and the given type." % self.pretty_name)
+                "Subtraction can not be applied to %s and the given type." % self.pretty_name
+            )
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return left - right
@@ -68,7 +70,8 @@ def sub(self, left, right) -> Union["Series", "Index"]:
     def mul(self, left, right) -> Union["Series", "Index"]:
         if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
             raise TypeError(
-                "Multiplication can not be applied to %s and the given type." % self.pretty_name)
+                "Multiplication can not be applied to %s and the given type." % self.pretty_name
+            )
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return left * right
@@ -80,7 +83,8 @@ def mul(self, left, right) -> Union["Series", "Index"]:
     def truediv(self, left, right) -> Union["Series", "Index"]:
         if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
             raise TypeError(
-                "True division can not be applied to %s and the given type." % self.pretty_name)
+                "True division can not be applied to %s and the given type." % self.pretty_name
+            )
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return left / right
@@ -92,7 +96,8 @@ def truediv(self, left, right) -> Union["Series", "Index"]:
     def floordiv(self, left, right) -> Union["Series", "Index"]:
         if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
             raise TypeError(
-                "Floor division can not be applied to %s and the given type." % self.pretty_name)
+                "Floor division can not be applied to %s and the given type." % self.pretty_name
+            )
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return left // right
@@ -104,7 +109,8 @@ def floordiv(self, left, right) -> Union["Series", "Index"]:
     def mod(self, left, right) -> Union["Series", "Index"]:
         if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
             raise TypeError(
-                "Modulo can not be applied to %s and the given type." % self.pretty_name)
+                "Modulo can not be applied to %s and the given type." % self.pretty_name
+            )
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return left % right
@@ -116,7 +122,8 @@ def mod(self, left, right) -> Union["Series", "Index"]:
     def pow(self, left, right) -> Union["Series", "Index"]:
         if not is_valid_operand_for_numeric_arithmetic(right, allow_bool=False):
             raise TypeError(
-                "Exponentiation can not be applied to %s and the given type." % self.pretty_name)
+                "Exponentiation can not be applied to %s and the given type." % self.pretty_name
+            )
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return left ** right
@@ -131,52 +138,59 @@ def radd(self, left, right) -> Union["Series", "Index"]:
             return right + left
         else:
             raise TypeError(
-                "Addition can not be applied to %s and the given type." % self.pretty_name)
+                "Addition can not be applied to %s and the given type." % self.pretty_name
+            )
 
     def rsub(self, left, right) -> Union["Series", "Index"]:
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return right - left
         else:
             raise TypeError(
-                "Subtraction can not be applied to %s and the given type." % self.pretty_name)
+                "Subtraction can not be applied to %s and the given type." % self.pretty_name
+            )
 
     def rmul(self, left, right) -> Union["Series", "Index"]:
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return right * left
         else:
             raise TypeError(
-                "Multiplication can not be applied to %s and the given type." % self.pretty_name)
+                "Multiplication can not be applied to %s and the given type." % self.pretty_name
+            )
 
     def rtruediv(self, left, right) -> Union["Series", "Index"]:
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return right / left
         else:
             raise TypeError(
-                "True division can not be applied to %s and the given type." % self.pretty_name)
+                "True division can not be applied to %s and the given type." % self.pretty_name
+            )
 
     def rfloordiv(self, left, right) -> Union["Series", "Index"]:
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return right // left
         else:
             raise TypeError(
-                "Floor division can not be applied to %s and the given type." % self.pretty_name)
+                "Floor division can not be applied to %s and the given type." % self.pretty_name
+            )
 
     def rpow(self, left, right) -> Union["Series", "Index"]:
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return right ** left
         else:
             raise TypeError(
-                "Exponentiation can not be applied to %s and the given type." % self.pretty_name)
+                "Exponentiation can not be applied to %s and the given type." % self.pretty_name
+            )
 
     def rmod(self, left, right) -> Union["Series", "Index"]:
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = left.spark.transform(lambda scol: scol.cast(as_spark_type(type(right))))
             return right % left
         else:
             raise TypeError(
-                "Modulo can not be applied to %s and the given type." % self.pretty_name)
+                "Modulo can not be applied to %s and the given type." % self.pretty_name
+            )
```

python/pyspark/pandas/data_type_ops/categorical_ops.py (+1, -1)

```diff
@@ -25,4 +25,4 @@ class CategoricalOps(DataTypeOps):
 
     @property
     def pretty_name(self) -> str:
-        return 'categoricals'
+        return "categoricals"
```

python/pyspark/pandas/data_type_ops/complex_ops.py (+9, -6)

```diff
@@ -34,22 +34,25 @@ class ArrayOps(DataTypeOps):
 
     @property
     def pretty_name(self) -> str:
-        return 'arrays'
+        return "arrays"
 
     def add(self, left, right) -> Union["Series", "Index"]:
         if not isinstance(right, IndexOpsMixin) or (
             isinstance(right, IndexOpsMixin) and not isinstance(right.spark.data_type, ArrayType)
         ):
             raise TypeError(
-                "Concatenation can not be applied to %s and the given type." % self.pretty_name)
+                "Concatenation can not be applied to %s and the given type." % self.pretty_name
+            )
 
         left_type = left.spark.data_type.elementType
         right_type = right.spark.data_type.elementType
 
         if left_type != right_type and not (
-            isinstance(left_type, NumericType) and isinstance(right_type, NumericType)):
+            isinstance(left_type, NumericType) and isinstance(right_type, NumericType)
+        ):
             raise TypeError(
-                "Concatenation can only be applied to %s of the same type" % self.pretty_name)
+                "Concatenation can only be applied to %s of the same type" % self.pretty_name
+            )
 
         return column_op(F.concat)(left, right)
 
@@ -61,7 +64,7 @@ class MapOps(DataTypeOps):
 
     @property
     def pretty_name(self) -> str:
-        return 'maps'
+        return "maps"
 
 
 class StructOps(DataTypeOps):
@@ -71,4 +74,4 @@ class StructOps(DataTypeOps):
 
     @property
     def pretty_name(self) -> str:
-        return 'structs'
+        return "structs"
```
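
The `pretty_name` hunks across these `data_type_ops` files are black's string-quote normalization: literals are rewritten with double quotes unless that would introduce extra escaping. A standalone illustration with hypothetical values:

```python
name = 'structs'  # black rewrites this to: name = "structs"
quote = 'a "quoted" word'  # left alone: double quotes would force escapes
print(name, quote)
```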
