Skip to content

Commit 6f09ac7

Browse files
committed
Fix snapshot_download on very large repo (>50k files) (#3122)
* Fix snapshot_download on very large repo (>50k files)
* Use iterators
* Fix typing issues
1 parent b70c474 commit 6f09ac7

File tree

2 files changed

+39
-17
lines changed

2 files changed

+39
-17
lines changed

src/huggingface_hub/_snapshot_download.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os
22
from pathlib import Path
3-
from typing import Dict, List, Literal, Optional, Union
3+
from typing import Dict, Iterable, List, Literal, Optional, Union
44

55
import requests
66
from tqdm.auto import tqdm as base_tqdm
@@ -15,13 +15,15 @@
1515
RevisionNotFoundError,
1616
)
1717
from .file_download import REGEX_COMMIT_HASH, hf_hub_download, repo_folder_name
18-
from .hf_api import DatasetInfo, HfApi, ModelInfo, SpaceInfo
18+
from .hf_api import DatasetInfo, HfApi, ModelInfo, RepoFile, SpaceInfo
1919
from .utils import OfflineModeIsEnabled, filter_repo_objects, logging, validate_hf_hub_args
2020
from .utils import tqdm as hf_tqdm
2121

2222

2323
logger = logging.get_logger(__name__)
2424

25+
VERY_LARGE_REPO_THRESHOLD = 50000 # After this limit, we don't consider `repo_info.siblings` to be reliable enough
26+
2527

2628
@validate_hf_hub_args
2729
def snapshot_download(
@@ -145,20 +147,22 @@ def snapshot_download(
145147

146148
storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
147149

150+
api = HfApi(
151+
library_name=library_name,
152+
library_version=library_version,
153+
user_agent=user_agent,
154+
endpoint=endpoint,
155+
headers=headers,
156+
token=token,
157+
)
158+
148159
repo_info: Union[ModelInfo, DatasetInfo, SpaceInfo, None] = None
149160
api_call_error: Optional[Exception] = None
150161
if not local_files_only:
151162
# try/except logic to handle different errors => taken from `hf_hub_download`
152163
try:
153164
# if we have internet connection we want to list files to download
154-
api = HfApi(
155-
library_name=library_name,
156-
library_version=library_version,
157-
user_agent=user_agent,
158-
endpoint=endpoint,
159-
headers=headers,
160-
)
161-
repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision, token=token)
165+
repo_info = api.repo_info(repo_id=repo_id, repo_type=repo_type, revision=revision)
162166
except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
163167
# Actually raise for those subclasses of ConnectionError
164168
raise
@@ -251,13 +255,31 @@ def snapshot_download(
251255
# => let's download the files!
252256
assert repo_info.sha is not None, "Repo info returned from server must have a revision sha."
253257
assert repo_info.siblings is not None, "Repo info returned from server must have a siblings list."
254-
filtered_repo_files = list(
255-
filter_repo_objects(
256-
items=[f.rfilename for f in repo_info.siblings],
257-
allow_patterns=allow_patterns,
258-
ignore_patterns=ignore_patterns,
258+
259+
# Corner case: on very large repos, the siblings list in `repo_info` might not contain all files.
260+
# In that case, we need to use the `list_repo_tree` method to prevent caching issues.
261+
repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings]
262+
has_many_files = len(repo_info.siblings) > VERY_LARGE_REPO_THRESHOLD
263+
if has_many_files:
264+
logger.info("The repo has more than 50,000 files. Using `list_repo_tree` to ensure all files are listed.")
265+
repo_files = (
266+
f.rfilename
267+
for f in api.list_repo_tree(repo_id=repo_id, recursive=True, revision=revision, repo_type=repo_type)
268+
if isinstance(f, RepoFile)
259269
)
270+
271+
filtered_repo_files: Iterable[str] = filter_repo_objects(
272+
items=repo_files,
273+
allow_patterns=allow_patterns,
274+
ignore_patterns=ignore_patterns,
260275
)
276+
277+
if not has_many_files:
278+
filtered_repo_files = list(filtered_repo_files)
279+
tqdm_desc = f"Fetching {len(filtered_repo_files)} files"
280+
else:
281+
tqdm_desc = "Fetching ... files"
282+
261283
commit_hash = repo_info.sha
262284
snapshot_folder = os.path.join(storage_folder, "snapshots", commit_hash)
263285
# if passed revision is not identical to commit_hash
@@ -305,7 +327,7 @@ def _inner_hf_hub_download(repo_file: str):
305327
thread_map(
306328
_inner_hf_hub_download,
307329
filtered_repo_files,
308-
desc=f"Fetching {len(filtered_repo_files)} files",
330+
desc=tqdm_desc,
309331
max_workers=max_workers,
310332
# User can use its own tqdm class or the default one from `huggingface_hub.utils`
311333
tqdm_class=tqdm_class or hf_tqdm,

src/huggingface_hub/serialization/_torch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ def save_torch_state_dict(
246246
shared_tensors_to_discard=shared_tensors_to_discard,
247247
)
248248
else:
249-
from torch import save as save_file_fn # type: ignore[assignment]
249+
from torch import save as save_file_fn # type: ignore[assignment, no-redef]
250250

251251
logger.warning(
252252
"You are using unsafe serialization. Due to security reasons, it is recommended not to load "

0 commit comments

Comments (0)