Skip to content

Commit bcc88bc

Browse files
committed
use pd_array in core
1 parent 602ab16 commit bcc88bc

File tree

4 files changed

+114
-2
lines changed

4 files changed

+114
-2
lines changed

.pre-commit-config.yaml

+7
Original file line numberDiff line numberDiff line change
@@ -228,3 +228,10 @@ repos:
228228
|\#\ type:\s?ignore(?!\[)
229229
language: pygrep
230230
types: [python]
231+
- id: use-pd_array-in-core
232+
name: Import pandas.array as pd_array in core
233+
language: python
234+
entry: python scripts/use_pd_array_in_core.py
235+
files: ^pandas/core/
236+
exclude: ^pandas/core/api\.py$
237+
types: [python]

pandas/core/strings/accessor.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3011,7 +3011,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
30113011
"""
30123012
from pandas import (
30133013
DataFrame,
3014-
array,
3014+
array as pd_array,
30153015
)
30163016

30173017
regex = re.compile(pat, flags=flags)
@@ -3022,7 +3022,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
30223022
result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
30233023
name = _get_single_group_name(regex)
30243024
# not dispatching, so we have to reconstruct here.
3025-
result = array(result, dtype=result_dtype)
3025+
result = pd_array(result, dtype=result_dtype)
30263026
else:
30273027
if isinstance(arr, ABCIndex):
30283028
raise ValueError("only one regex group is supported with Index")
+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import pytest
2+
3+
from scripts.use_pd_array_in_core import use_pd_array
4+
5+
BAD_FILE_0 = "import pandas as pd\npd.array"
6+
BAD_FILE_1 = "\nfrom pandas import array"
7+
GOOD_FILE_0 = "from pandas import array as pd_array"
8+
GOOD_FILE_1 = "from pandas.core.construction import array as pd_array"
9+
PATH = "t.py"
10+
11+
12+
@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1])
def test_inconsistent_usage(content, capsys):
    """
    Check that un-aliased uses of pandas' ``array`` are reported and abort.

    Both bad inputs place the offending node on line 2, column 0, so a single
    expected message covers every parametrized case.
    """
    # Deliberately NOT a raw string: the script writes a message terminated by
    # a real newline, whereas r"...\n" would end in a backslash followed by
    # the letter "n" and could never compare equal to the captured output.
    expected_msg = (
        "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n"
    )
    with pytest.raises(SystemExit) as excinfo:
        use_pd_array(content, PATH)
    # The checker signals failure to pre-commit through exit status 1.
    assert excinfo.value.code == 1
    result_msg, _ = capsys.readouterr()
    assert result_msg == expected_msg
21+
22+
23+
@pytest.mark.parametrize("content", (GOOD_FILE_0, GOOD_FILE_1))
def test_consistent_usage(content):
    """Check that sources aliasing pandas' ``array`` as ``pd_array`` pass."""
    # A violation would surface as SystemExit, which pytest reports as a
    # failure, so simply returning from the call is the success condition.
    use_pd_array(content, PATH)

scripts/use_pd_array_in_core.py

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
Check that pandas/core imports pandas.array as pd_array.
3+
4+
This makes it easier to grep for usage of pandas array.
5+
6+
This is meant to be run as a pre-commit hook - to run it manually, you can do:
7+
8+
pre-commit run use-pd_array-in-core --all-files
9+
10+
"""
11+
12+
import argparse
13+
import ast
14+
import sys
15+
from typing import (
16+
Optional,
17+
Sequence,
18+
)
19+
20+
ERROR_MESSAGE = (
    "{path}:{lineno}:{col_offset}: "
    "Don't use pd.array in core, import array as pd_array instead\n"
)


class Visitor(ast.NodeVisitor):
    """
    AST visitor that stops at the first disallowed use of pandas' ``array``.

    Two patterns are rejected:

    * ``from pandas... import array`` without the ``pd_array`` alias;
    * attribute access spelled ``pd.array``.

    On the first match the formatted ``ERROR_MESSAGE`` is written to stdout
    and the interpreter exits with status 1.

    Parameters
    ----------
    path : str
        Name of the file being checked; only used in the error message.
    """

    def __init__(self, path: str) -> None:
        self.path = path

    def _report_and_exit(self, node: ast.AST) -> None:
        """Write the violation located at ``node`` to stdout and exit(1)."""
        location = {
            "path": self.path,
            "lineno": node.lineno,
            "col_offset": node.col_offset,
        }
        sys.stdout.write(ERROR_MESSAGE.format(**location))
        sys.exit(1)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        # ``node.module`` is None for ``from . import x``; treating it as an
        # empty string makes the prefix test fail for that case.
        module = node.module or ""
        if module.startswith("pandas"):
            for alias in node.names:
                # Anything imported under the name ``array`` from a pandas
                # module must carry the ``pd_array`` alias.
                if alias.name == "array" and alias.asname != "pd_array":
                    self._report_and_exit(node)
        self.generic_visit(node)

    def visit_Attribute(self, node: ast.Attribute) -> None:
        owner = node.value
        # Only the literal spelling ``pd.array`` is matched here.
        spelled_pd_array = (
            node.attr == "array" and isinstance(owner, ast.Name) and owner.id == "pd"
        )
        if spelled_pd_array:
            self._report_and_exit(node)
        self.generic_visit(node)
57+
58+
59+
def use_pd_array(content: str, path: str) -> None:
    """
    Check one file's source for disallowed uses of pandas' ``array``.

    Parameters
    ----------
    content : str
        Python source code to parse and inspect.
    path : str
        File name to show in the error message.

    Notes
    -----
    Returns normally when nothing is flagged. On a violation the visitor
    prints the message and exits the interpreter, so ``SystemExit``
    propagates to the caller.
    """
    Visitor(path).visit(ast.parse(content))
65+
66+
67+
def main(argv: Optional[Sequence[str]] = None) -> None:
    """
    Command-line entry point: run the ``pd_array`` check on each given file.

    Parameters
    ----------
    argv : sequence of str, optional
        File paths to check. When None, argparse reads them from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("paths", nargs="*")
    options = cli.parse_args(argv)

    for filename in options.paths:
        with open(filename, encoding="utf-8") as handle:
            source = handle.read()
        use_pd_array(source, filename)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)