Skip to content

Commit 03694ef

Browse files
authored
More stability checker options (#5299)
## Summary This contains three changes: * repos in `check_ecosystem.py` are stored as `org:name` instead of `org/name` to create a flat directory layout * `check_ecosystem.py` performs a maximum of 50 parallel jobs at the same time to avoid consuming to much RAM * `check-formatter-stability` gets a new option `--multi-project` so it's possible to do `cargo run --bin ruff_dev -- check-formatter-stability --multi-project target/checkouts` With these three changes it becomes easy to check the formatter stability over a larger number of repositories. This is part of the integration of integrating formatter regressions checks into the ecosystem checks. ## Test Plan ```shell python scripts/check_ecosystem.py --checkouts target/checkouts --projects github_search.jsonl -v $(which true) $(which true) cargo run --bin ruff_dev -- check-formatter-stability --multi-project target/checkouts ```
1 parent f9f0cf7 commit 03694ef

File tree

2 files changed

+68
-17
lines changed

2 files changed

+68
-17
lines changed

crates/ruff_dev/src/check_formatter_stability.rs

+46-7
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ pub(crate) struct Args {
4242
/// Print only the first error and exit, `-x` is same as pytest
4343
#[arg(long, short = 'x')]
4444
pub(crate) exit_first_error: bool,
45+
/// Checks each project inside a directory
46+
#[arg(long)]
47+
pub(crate) multi_project: bool,
4548
}
4649

4750
/// Generate ourself a `try_parse_from` impl for `CheckArgs`. This is a strange way to use clap but
@@ -54,6 +57,35 @@ struct WrapperArgs {
5457
}
5558

5659
pub(crate) fn main(args: &Args) -> anyhow::Result<ExitCode> {
60+
let all_success = if args.multi_project {
61+
let mut all_success = true;
62+
for base_dir in &args.files {
63+
for dir in base_dir.read_dir()? {
64+
let dir = dir?;
65+
println!("Starting {}", dir.path().display());
66+
let success = check_repo(&Args {
67+
files: vec![dir.path().clone()],
68+
..*args
69+
});
70+
println!("Finished {}: {:?}", dir.path().display(), success);
71+
if !matches!(success, Ok(true)) {
72+
all_success = false;
73+
}
74+
}
75+
}
76+
all_success
77+
} else {
78+
check_repo(args)?
79+
};
80+
if all_success {
81+
Ok(ExitCode::SUCCESS)
82+
} else {
83+
Ok(ExitCode::FAILURE)
84+
}
85+
}
86+
87+
/// Returns whether the check was successful
88+
pub(crate) fn check_repo(args: &Args) -> anyhow::Result<bool> {
5789
let start = Instant::now();
5890

5991
// Find files to check (or in this case, format twice). Adapted from ruff_cli
@@ -77,13 +109,20 @@ pub(crate) fn main(args: &Args) -> anyhow::Result<ExitCode> {
77109
let (paths, _resolver) = python_files_in_path(&cli.files, &pyproject_config, &overrides)?;
78110
assert!(!paths.is_empty(), "no python files in {:?}", cli.files);
79111

112+
let mut formatted_counter = 0;
80113
let errors = paths
81114
.into_iter()
82115
.map(|dir_entry| {
83116
// Doesn't make sense to recover here in this test script
84-
let file = dir_entry
85-
.expect("Iterating the files in the repository failed")
86-
.into_path();
117+
dir_entry.expect("Iterating the files in the repository failed")
118+
})
119+
.filter(|dir_entry| {
120+
// For some reason it does not filter in the beginning
121+
dir_entry.file_name() != "pyproject.toml"
122+
})
123+
.map(|dir_entry| {
124+
let file = dir_entry.path().to_path_buf();
125+
formatted_counter += 1;
87126
// Handle panics (mostly in `debug_assert!`)
88127
let result = match catch_unwind(|| check_file(&file)) {
89128
Ok(result) => result,
@@ -166,20 +205,20 @@ Formatted twice:
166205
}
167206

168207
if args.exit_first_error {
169-
return Ok(ExitCode::FAILURE);
208+
return Ok(false);
170209
}
171210
}
172211
let duration = start.elapsed();
173212
println!(
174213
"Formatting {} files twice took {:.2}s",
175-
cli.files.len(),
214+
formatted_counter,
176215
duration.as_secs_f32()
177216
);
178217

179218
if any_errors {
180-
Ok(ExitCode::FAILURE)
219+
Ok(false)
181220
} else {
182-
Ok(ExitCode::SUCCESS)
221+
Ok(true)
183222
}
184223
}
185224

scripts/check_ecosystem.py

+22-10
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,11 @@ class Repository(NamedTuple):
4444
async def clone(self: Self, checkout_dir: Path) -> AsyncIterator[Path]:
4545
"""Shallow clone this repository to a temporary directory."""
4646
if checkout_dir.exists():
47-
logger.debug(f"Reusing {self.org}/{self.repo}")
47+
logger.debug(f"Reusing {self.org}:{self.repo}")
4848
yield Path(checkout_dir)
4949
return
5050

51-
logger.debug(f"Cloning {self.org}/{self.repo}")
51+
logger.debug(f"Cloning {self.org}:{self.repo}")
5252
git_command = [
5353
"git",
5454
"clone",
@@ -177,18 +177,17 @@ async def compare(
177177
"""Check a specific repository against two versions of ruff."""
178178
removed, added = set(), set()
179179

180-
# Allows to keep the checkouts locations
180+
# By the default, the git clone are transient, but if the user provides a
181+
# directory for permanent storage we keep it there
181182
if checkouts:
182-
checkout_parent = checkouts.joinpath(repo.org)
183-
# Don't create the repodir itself, we need that for checking for existing
184-
# clones
185-
checkout_parent.mkdir(exist_ok=True, parents=True)
186-
location_context = nullcontext(checkout_parent)
183+
location_context = nullcontext(checkouts)
187184
else:
188185
location_context = tempfile.TemporaryDirectory()
189186

190187
with location_context as checkout_parent:
191-
checkout_dir = Path(checkout_parent).joinpath(repo.repo)
188+
assert ":" not in repo.org
189+
assert ":" not in repo.repo
190+
checkout_dir = Path(checkout_parent).joinpath(f"{repo.org}:{repo.repo}")
192191
async with repo.clone(checkout_dir) as path:
193192
try:
194193
async with asyncio.TaskGroup() as tg:
@@ -284,8 +283,19 @@ async def main(
284283

285284
logger.debug(f"Checking {len(repositories)} projects")
286285

286+
# https://stackoverflow.com/a/61478547/3549270
287+
# Otherwise doing 3k repositories can take >8GB RAM
288+
semaphore = asyncio.Semaphore(50)
289+
290+
async def limited_parallelism(coroutine): # noqa: ANN
291+
async with semaphore:
292+
return await coroutine
293+
287294
results = await asyncio.gather(
288-
*[compare(ruff1, ruff2, repo, checkouts) for repo in repositories.values()],
295+
*[
296+
limited_parallelism(compare(ruff1, ruff2, repo, checkouts))
297+
for repo in repositories.values()
298+
],
289299
return_exceptions=True,
290300
)
291301

@@ -433,6 +443,8 @@ async def main(
433443
logging.basicConfig(level=logging.INFO)
434444

435445
loop = asyncio.get_event_loop()
446+
if args.checkouts:
447+
args.checkouts.mkdir(exist_ok=True, parents=True)
436448
main_task = asyncio.ensure_future(
437449
main(
438450
ruff1=args.ruff1,

0 commit comments

Comments
 (0)