Skip to content

STYLE use pd_array in core #40319

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 2, 2021
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,10 @@ repos:
|\#\ type:\s?ignore(?!\[)
language: pygrep
types: [python]
- id: use-pd_array-in-core
name: Import pandas.array as pd_array in core
language: python
entry: python scripts/use_pd_array_in_core.py
files: ^pandas/core/
exclude: ^pandas/core/api\.py$
types: [python]
4 changes: 2 additions & 2 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3023,7 +3023,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
"""
from pandas import (
DataFrame,
array,
array as pd_array,
)

regex = re.compile(pat, flags=flags)
Expand All @@ -3034,7 +3034,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
name = _get_single_group_name(regex)
# not dispatching, so we have to reconstruct here.
result = array(result, dtype=result_dtype)
result = pd_array(result, dtype=result_dtype)
else:
if isinstance(arr, ABCIndex):
raise ValueError("only one regex group is supported with Index")
Expand Down
26 changes: 26 additions & 0 deletions scripts/tests/test_use_pd_array_in_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pytest

from scripts.use_pd_array_in_core import use_pd_array

# Sources that must be rejected: attribute access ``pd.array`` and an
# un-aliased ``from pandas import array``. In both, the offending node sits
# on line 2, column 0.
BAD_FILE_0 = "import pandas as pd\npd.array"
BAD_FILE_1 = "\nfrom pandas import array"
# Sources that must be accepted: ``array`` imported under the alias
# ``pd_array``, either from the top-level package or from a submodule.
GOOD_FILE_0 = "from pandas import array as pd_array"
GOOD_FILE_1 = "from pandas.core.construction import array as pd_array"
# Path handed to the checker; it is only used as the prefix of the report.
PATH = "t.py"


@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1])
def test_inconsistent_usage(content, capsys):
    """Each bad source must abort with SystemExit and print the report."""
    with pytest.raises(SystemExit):
        use_pd_array(content, PATH)

    # Only stdout matters: the checker writes its report there.
    captured_stdout, _ = capsys.readouterr()
    wanted_report = (
        "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n"
    )
    assert wanted_report == captured_stdout


@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1])
def test_consistent_usage(content):
    """Correctly aliased imports must pass through the checker silently."""
    # Passing means no exception (in particular no SystemExit) is raised.
    use_pd_array(content, PATH)
77 changes: 77 additions & 0 deletions scripts/use_pd_array_in_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Check that pandas/core imports pandas.array as pd_array.
This makes it easier to grep for usage of pandas array.
This is meant to be run as a pre-commit hook - to run it manually, you can do:
pre-commit run use-pd_array-in-core --all-files
"""

import argparse
import ast
import sys
from typing import (
Optional,
Sequence,
)

# Template for the violation report. It is filled in with the path of the
# checked file and the line/column of the offending AST node, and it already
# ends with a newline so it can be written to the stream as-is.
ERROR_MESSAGE = (
    "{path}:{lineno}:{col_offset}: "
    "Don't use pd.array in core, import array as pd_array instead\n"
)


class Visitor(ast.NodeVisitor):
    """
    AST visitor that rejects uses of pandas' ``array`` not spelled ``pd_array``.

    Two patterns are flagged:

    * ``from pandas[.something] import array`` without ``as pd_array``;
    * the attribute access ``pd.array``.

    On the first violation the report is written to stdout and the process
    exits with status 1 (``SystemExit`` is raised).

    Parameters
    ----------
    path : str
        Path of the file being checked; used only as the report prefix.
    """

    def __init__(self, path: str) -> None:
        self.path = path

    def _report_and_exit(self, node: ast.AST) -> None:
        """Write the violation report for *node* to stdout and exit with 1."""
        msg = ERROR_MESSAGE.format(
            path=self.path, lineno=node.lineno, col_offset=node.col_offset
        )
        sys.stdout.write(msg)
        sys.exit(1)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Flag ``from pandas... import array`` unless aliased as pd_array."""
        module = node.module
        # Match the pandas package and its submodules only. A bare
        # ``startswith("pandas")`` would also catch unrelated packages whose
        # name merely begins with "pandas". ``module`` is None for
        # ``from . import x``.
        from_pandas = module is not None and (
            module == "pandas" or module.startswith("pandas.")
        )
        if from_pandas and any(
            alias.name == "array" and alias.asname != "pd_array"
            for alias in node.names
        ):
            self._report_and_exit(node)
        super().generic_visit(node)

    def visit_Attribute(self, node: ast.Attribute) -> None:
        """Flag the literal attribute access ``pd.array``."""
        if (
            isinstance(node.value, ast.Name)
            and node.value.id == "pd"
            and node.attr == "array"
        ):
            self._report_and_exit(node)
        super().generic_visit(node)


def use_pd_array(content: str, path: str) -> None:
    """
    Check one file's source for un-aliased uses of pandas' ``array``.

    Parameters
    ----------
    content : str
        Python source code to check.
    path : str
        Path the source came from; used in the violation report and in any
        syntax error raised while parsing.

    Raises
    ------
    SyntaxError
        If ``content`` is not valid Python.
    SystemExit
        Raised by the visitor on the first violation it finds.
    """
    # Pass the real path so a syntax error points at the offending file
    # instead of the default "<unknown>".
    tree = ast.parse(content, filename=path)
    Visitor(path).visit(tree)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """
    Command-line entry point: check every file named on the command line.

    Parameters
    ----------
    argv : sequence of str, optional
        Argument list to parse. When None, argparse falls back to the
        process arguments.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("paths", nargs="*")
    options = cli.parse_args(argv)

    for filename in options.paths:
        # Read the whole file as UTF-8 and close it before running the check.
        with open(filename, encoding="utf-8") as handle:
            source = handle.read()
        use_pd_array(source, filename)


# Run the check only when this file is executed as a script; importing the
# module (as the tests do) must not trigger argument parsing.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ omit =
pandas/_typing.py
pandas/_version.py
plugins = Cython.Coverage
source = pandas

[coverage:report]
ignore_errors = False
Expand Down