Skip to content

STYLE loosen inconsistent namespace check #40532

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
3 changes: 1 addition & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,10 @@ repos:
types: [python]
exclude: ^pandas/_typing\.py$
- id: inconsistent-namespace-usage
name: 'Check for inconsistent use of pandas namespace in tests'
name: 'Check for inconsistent use of pandas namespace'
entry: python scripts/check_for_inconsistent_pandas_namespace.py
language: python
types: [python]
files: ^pandas/tests/
- id: incorrect-code-directives
name: Check for incorrect code block or IPython directives
language: pygrep
Expand Down
20 changes: 9 additions & 11 deletions asv_bench/benchmarks/arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,7 @@ def setup(self, op, shape):
# construct dataframe with 2 blocks
arr1 = np.random.randn(n_rows, n_cols // 2).astype("f8")
arr2 = np.random.randn(n_rows, n_cols // 2).astype("f4")
df = pd.concat(
[pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True
)
df = pd.concat([DataFrame(arr1), DataFrame(arr2)], axis=1, ignore_index=True)
# should already be the case, but just to be sure
df._consolidate_inplace()

Expand All @@ -151,7 +149,7 @@ def setup(self, op, shape):
arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8")
arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8")
df2 = pd.concat(
[pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)],
[DataFrame(arr1), DataFrame(arr2), DataFrame(arr3)],
axis=1,
ignore_index=True,
)
Expand Down Expand Up @@ -459,9 +457,9 @@ class OffsetArrayArithmetic:

def setup(self, offset):
N = 10000
rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
rng = date_range(start="1/1/2000", periods=N, freq="T")
self.rng = rng
self.ser = pd.Series(rng)
self.ser = Series(rng)

def time_add_series_offset(self, offset):
with warnings.catch_warnings(record=True):
Expand All @@ -478,7 +476,7 @@ class ApplyIndex:

def setup(self, offset):
N = 10000
rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
rng = date_range(start="1/1/2000", periods=N, freq="T")
self.rng = rng

def time_apply_index(self, offset):
Expand All @@ -490,17 +488,17 @@ class BinaryOpsMultiIndex:
param_names = ["func"]

def setup(self, func):
date_range = pd.date_range("20200101 00:00", "20200102 0:00", freq="S")
array = date_range("20200101 00:00", "20200102 0:00", freq="S")
level_0_names = [str(i) for i in range(30)]

index = pd.MultiIndex.from_product([level_0_names, date_range])
index = pd.MultiIndex.from_product([level_0_names, array])
column_names = ["col_1", "col_2"]

self.df = pd.DataFrame(
self.df = DataFrame(
np.random.rand(len(index), 2), index=index, columns=column_names
)

self.arg_df = pd.DataFrame(
self.arg_df = DataFrame(
np.random.randint(1, 10, (len(level_0_names), 2)),
index=level_0_names,
columns=column_names,
Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def setup(self):
data = np.random.randn(N)[:-i]
idx = rng[:-i]
data[100:] = np.nan
self.series[i] = pd.Series(pd.SparseArray(data), index=idx)
self.series[i] = Series(SparseArray(data), index=idx)

def time_series_to_frame(self):
pd.DataFrame(self.series)
Expand Down Expand Up @@ -63,7 +63,7 @@ def setup(self):
)

def time_sparse_series_from_coo(self):
pd.Series.sparse.from_coo(self.matrix)
Series.sparse.from_coo(self.matrix)


class ToCoo:
Expand Down
14 changes: 7 additions & 7 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,12 @@ def box_expected(expected, box_cls, transpose=True):
"""
if box_cls is pd.array:
expected = pd.array(expected)
elif box_cls is pd.Index:
expected = pd.Index(expected)
elif box_cls is pd.Series:
expected = pd.Series(expected)
elif box_cls is pd.DataFrame:
expected = pd.Series(expected).to_frame()
elif box_cls is Index:
expected = Index(expected)
elif box_cls is Series:
expected = Series(expected)
elif box_cls is DataFrame:
expected = Series(expected).to_frame()
if transpose:
# for vector operations, we need a DataFrame to be a single-row,
# not a single-column, in order to operate against non-DataFrame
Expand Down Expand Up @@ -400,7 +400,7 @@ def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None)
"x": state.rand(n) * 2 - 1,
"y": state.rand(n) * 2 - 1,
}
df = pd.DataFrame(columns, index=index, columns=sorted(columns))
df = DataFrame(columns, index=index, columns=sorted(columns))
if df.index[-1] == end:
df = df.iloc[:-1]
return df
Expand Down
18 changes: 8 additions & 10 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def unique_nulls_fixture(request):
# ----------------------------------------------------------------


@pytest.fixture(params=[pd.DataFrame, pd.Series])
@pytest.fixture(params=[DataFrame, Series])
def frame_or_series(request):
"""
Fixture to parametrize over DataFrame and Series.
Expand All @@ -328,7 +328,7 @@ def frame_or_series(request):

# error: List item 0 has incompatible type "Type[Index]"; expected "Type[IndexOpsMixin]"
@pytest.fixture(
params=[pd.Index, pd.Series], ids=["index", "series"] # type: ignore[list-item]
params=[Index, Series], ids=["index", "series"] # type: ignore[list-item]
)
def index_or_series(request):
"""
Expand All @@ -346,9 +346,7 @@ def index_or_series(request):
index_or_series2 = index_or_series


@pytest.fixture(
params=[pd.Index, pd.Series, pd.array], ids=["index", "series", "array"]
)
@pytest.fixture(params=[Index, Series, pd.array], ids=["index", "series", "array"])
def index_or_series_or_array(request):
"""
Fixture to parametrize over Index, Series, and ExtensionArray
Expand Down Expand Up @@ -549,7 +547,7 @@ def index_with_missing(request):
# ----------------------------------------------------------------
@pytest.fixture
def empty_series():
return pd.Series([], index=[], dtype=np.float64)
return Series([], index=[], dtype=np.float64)


@pytest.fixture
Expand Down Expand Up @@ -586,7 +584,7 @@ def _create_series(index):
""" Helper for the _series dict """
size = len(index)
data = np.random.randn(size)
return pd.Series(data, index=index, name="a")
return Series(data, index=index, name="a")


_series = {
Expand Down Expand Up @@ -1409,16 +1407,16 @@ def any_numpy_dtype(request):
("boolean", [True, np.nan, False]),
("boolean", [True, pd.NA, False]),
("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]),
("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]),
("datetime", [Timestamp("20130101"), np.nan, Timestamp("20180101")]),
("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
# The following two dtypes are commented out due to GH 23554
# ('complex', [1 + 1j, np.nan, 2 + 2j]),
# ('timedelta64', [np.timedelta64(1, 'D'),
# np.nan, np.timedelta64(2, 'D')]),
("timedelta", [timedelta(1), np.nan, timedelta(2)]),
("time", [time(1), np.nan, time(2)]),
("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]),
("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]),
("period", [Period(2013), pd.NaT, Period(2018)]),
("interval", [Interval(0, 1), np.nan, Interval(0, 2)]),
]
ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id

Expand Down
10 changes: 5 additions & 5 deletions pandas/io/formats/style.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,8 @@ def __init__(
if not data.index.is_unique or not data.columns.is_unique:
raise ValueError("style is not supported for non-unique indices.")
self.data: DataFrame = data
self.index: pd.Index = data.index
self.columns: pd.Index = data.columns
self.index: Index = data.index
self.columns: Index = data.columns
self.table_styles = table_styles
if not isinstance(uuid_len, int) or not uuid_len >= 0:
raise TypeError("``uuid_len`` must be an integer in range [0, 32].")
Expand Down Expand Up @@ -914,7 +914,7 @@ def _apply(
result.columns = data.columns
else:
result = func(data, **kwargs)
if not isinstance(result, pd.DataFrame):
if not isinstance(result, DataFrame):
if not isinstance(result, np.ndarray):
raise TypeError(
f"Function {repr(func)} must return a DataFrame or ndarray "
Expand Down Expand Up @@ -1563,7 +1563,7 @@ def css(rgba) -> str:
if s.ndim == 1:
return [css(rgba) for rgba in rgbas]
else:
return pd.DataFrame(
return DataFrame(
[[css(rgba) for rgba in row] for row in rgbas],
index=s.index,
columns=s.columns,
Expand Down Expand Up @@ -1653,7 +1653,7 @@ def css(x):
if s.ndim == 1:
return [css(x) for x in normed]
else:
return pd.DataFrame(
return DataFrame(
[[css(x) for x in row] for row in normed],
index=s.index,
columns=s.columns,
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1372,9 +1372,9 @@ def array_likes(request):
data = memoryview(arr)
elif name == "array":
# stdlib array
from array import array as array_stdlib
import array

data = array_stdlib("i", arr)
data = array.array("i", arr)
elif name == "dask":
import dask.array

Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,14 +1231,14 @@ def __len__(self, n):
def test_constructor_stdlib_array(self):
# GH 4297
# support Array
from array import array as stdlib_array
import array

result = DataFrame({"A": stdlib_array("i", range(10))})
result = DataFrame({"A": array.array("i", range(10))})
expected = DataFrame({"A": list(range(10))})
tm.assert_frame_equal(result, expected, check_dtype=False)

expected = DataFrame([list(range(10)), list(range(10))])
result = DataFrame([stdlib_array("i", range(10)), stdlib_array("i", range(10))])
result = DataFrame([array.array("i", range(10)), array.array("i", range(10))])
tm.assert_frame_equal(result, expected, check_dtype=False)

def test_constructor_range(self):
Expand Down
71 changes: 46 additions & 25 deletions scripts/check_for_inconsistent_pandas_namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Check that test suite file doesn't use the pandas namespace inconsistently.

We check for cases of ``Series`` and ``pd.Series`` appearing in the same file
(likewise for some other common classes).
(likewise for other pandas objects).

This is meant to be run as a pre-commit hook - to run it manually, you can do:

Expand All @@ -15,43 +15,50 @@
though note that you may need to manually fixup some imports and that you will also
need the additional dependency `tokenize-rt` (which is left out from the pre-commit
hook so that it uses the same virtualenv as the other local ones).

The general structure is similar to that of some plugins from
https://github.com/asottile/pyupgrade .
"""

import argparse
import ast
import sys
from typing import (
MutableMapping,
NamedTuple,
Optional,
Sequence,
Set,
Tuple,
)

ERROR_MESSAGE = "Found both `pd.{name}` and `{name}` in {path}"
EXCLUDE = {
"eval", # built-in, different from `pd.eval`
"np", # pd.np is deprecated but still tested
}
Offset = Tuple[int, int]
ERROR_MESSAGE = (
"{path}:{lineno}:{col_offset}: "
"Found both '{prefix}.{name}' and '{name}' in {path}"
)


class OffsetWithNamespace(NamedTuple):
lineno: int
col_offset: int
namespace: str


class Visitor(ast.NodeVisitor):
def __init__(self) -> None:
self.pandas_namespace: MutableMapping[Offset, str] = {}
self.no_namespace: Set[str] = set()
self.pandas_namespace: MutableMapping[OffsetWithNamespace, str] = {}
self.imported_from_pandas: Set[str] = set()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Outside the scope of this PR, of course, but I'm curious how hard it would be to generalize this to any namespace. It could then be a helpful linting check even outside of pandas.

Copy link
Member Author

@MarcoGorelli MarcoGorelli Mar 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a great idea, thanks! Added to my backlog


def visit_Attribute(self, node: ast.Attribute) -> None:
if (
isinstance(node.value, ast.Name)
and node.value.id == "pd"
and node.attr not in EXCLUDE
):
self.pandas_namespace[(node.lineno, node.col_offset)] = node.attr
if isinstance(node.value, ast.Name) and node.value.id in {"pandas", "pd"}:
offset_with_namespace = OffsetWithNamespace(
node.lineno, node.col_offset, node.value.id
)
self.pandas_namespace[offset_with_namespace] = node.attr
self.generic_visit(node)

def visit_Name(self, node: ast.Name) -> None:
if node.id not in EXCLUDE:
self.no_namespace.add(node.id)
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
if node.module is not None and "pandas" in node.module:
self.imported_from_pandas.update(name.name for name in node.names)
self.generic_visit(node)


Expand All @@ -64,9 +71,11 @@ def replace_inconsistent_pandas_namespace(visitor: Visitor, content: str) -> str

tokens = src_to_tokens(content)
for n, i in reversed_enumerate(tokens):
offset_with_namespace = OffsetWithNamespace(i.offset[0], i.offset[1], i.src)
if (
i.offset in visitor.pandas_namespace
and visitor.pandas_namespace[i.offset] in visitor.no_namespace
offset_with_namespace in visitor.pandas_namespace
and visitor.pandas_namespace[offset_with_namespace]
in visitor.imported_from_pandas
):
# Replace `pd`
tokens[n] = i._replace(src="")
Expand All @@ -85,16 +94,28 @@ def check_for_inconsistent_pandas_namespace(
visitor = Visitor()
visitor.visit(tree)

inconsistencies = visitor.no_namespace.intersection(
inconsistencies = visitor.imported_from_pandas.intersection(
visitor.pandas_namespace.values()
)

if not inconsistencies:
# No inconsistent namespace usage, nothing to replace.
return content
return None

if not replace:
msg = ERROR_MESSAGE.format(name=inconsistencies.pop(), path=path)
raise RuntimeError(msg)
inconsistency = inconsistencies.pop()
lineno, col_offset, prefix = next(
key for key, val in visitor.pandas_namespace.items() if val == inconsistency
)
msg = ERROR_MESSAGE.format(
lineno=lineno,
col_offset=col_offset,
prefix=prefix,
name=inconsistency,
path=path,
)
sys.stdout.write(msg)
sys.exit(1)

return replace_inconsistent_pandas_namespace(visitor, content)

Expand Down
Loading