pandas-dev · fangchenli · Apr 11, 2025 · Apr 12, 2025 · Apr 13, 2025 · Apr 13, 2025
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -9,7 +9,9 @@
 import codecs
 from collections import defaultdict
 from collections.abc import (
+    Generator,
     Hashable,
+    Iterable,
     Mapping,
     Sequence,
 )
@@ -26,7 +28,10 @@
 )
 import mmap
 import os
-from pathlib import Path
+from pathlib import (
+    Path,
+    PurePosixPath,
+)
 import re
 import tarfile
 from typing import (
@@ -55,6 +60,7 @@
     BaseBuffer,
     ReadCsvBuffer,
 )
+from pandas.compat import is_platform_windows
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
@@ -1282,3 +1288,133 @@ def dedup_names(
         counts[col] = cur_count + 1
 
     return names
+
+
+def _infer_protocol(path: str) -> str:
+    """
+    Return the protocol name used to decide how ``path`` is accessed.
+
+    Parameters
+    ----------
+    path : str
+        Path or URL to inspect.
+
+    Returns
+    -------
+    str
+        The URL scheme of ``path`` when that scheme is listed in
+        ``_VALID_URLS``; ``"file"`` in every other case.
+    """
+    # Treat Windows drive letters (a letter, a colon, then a slash or
+    # backslash) as local file paths before any URL parsing happens.
+    if is_platform_windows():
+        if re.match(r"^[a-zA-Z]:[\\/]", path):
+            return "file"
+
+    # NOTE(review): any scheme outside _VALID_URLS falls back to "file" --
+    # confirm that set covers the fsspec protocols (e.g. s3) that callers
+    # expect to be handled as remote.
+    scheme = parse_url(path).scheme
+    return scheme if scheme in _VALID_URLS else "file"
+
+
+def _match_file(
+    path: Path | PurePosixPath, extensions: set[str] | None, glob: str | None
+) -> bool:
+    """
+    Check whether a file path passes the extension and glob filters.
+
+    Parameters
+    ----------
+    path : Path or PurePosixPath
+        The file path to check.
+    extensions : set of str or None
+        Suffixes compared against ``path.suffix.lower()``, so entries are
+        expected to be lower-case and to include the leading dot.
+        None disables the extension filter.
+    glob : str or None
+        Pattern handed to ``path.match``. None disables the glob filter.
+
+    Returns
+    -------
+    bool
+        True if the path passes every enabled filter, False otherwise.
+    """
+    # Extension filter first: reject early when the suffix is not wanted.
+    if extensions is not None and path.suffix.lower() not in extensions:
+        return False
+    if glob is None:
+        return True
+    return path.match(glob)
+
+
+def iterdir(
+    path: FilePath,
+    extensions: str | Iterable[str] | None = None,
+    glob: str | None = None,
+) -> Generator[Path | PurePosixPath, None, None]:
+    """
+    Yield file paths in a directory (no nesting allowed).
+
+    If ``path`` points at a single file, that file is yielded when it
+    passes the filters. Sub-directories are never descended into.
+
+    Supports:
+    - Local paths (str, os.PathLike)
+    - file:// URLs
+    - Remote paths (e.g., s3://) via fsspec (if installed)
+
+    Parameters
+    ----------
+    path : FilePath
+        Path to the directory (local or remote).
+    extensions : str or iterable of str, optional
+        Only yield files with the given extension(s). Case-insensitive;
+        the leading dot is optional (``"csv"`` and ``".csv"`` are
+        equivalent). Only the final suffix of a file name is compared, so
+        a compound extension such as ``".tar.gz"`` never matches.
+        If None, all files are yielded.
+    glob : str, optional
+        Only yield files matching the given glob pattern.
+        If None, all files are yielded.
+
+    Yields
+    ------
+    pathlib.Path or pathlib.PurePosixPath
+        File paths within the directory. Local files are yielded as
+        ``Path``; remote files as ``PurePosixPath`` built from the path
+        fsspec reports for them.
+
+    Raises
+    ------
+    NotADirectoryError
+        If the given path is neither a file nor a directory.
+    ImportError
+        If fsspec is required but not installed.
+
+    Notes
+    -----
+    This is a generator: the path is not inspected, and no error is
+    raised, until iteration starts.
+    """
+    # Function-scope stdlib imports keep this block self-contained.
+    from urllib.parse import urlparse
+    from urllib.request import url2pathname
+
+    suffixes: set[str] | None = None
+    if extensions is not None:
+        requested = [extensions] if isinstance(extensions, str) else extensions
+        # Path.suffix always carries the leading dot, so add it when the
+        # caller left it out. An empty string is kept as-is: it selects
+        # files that have no suffix at all.
+        suffixes = {
+            ext if not ext or ext.startswith(".") else f".{ext}"
+            for ext in (item.lower() for item in requested)
+        }
+
+    path_str = os.fspath(path)
+    scheme = _infer_protocol(path_str)
+
+    if scheme == "file":
+        if path_str[:7].lower() == "file://":
+            # Path() does not understand URLs; convert to a plain
+            # filesystem path (this also decodes %-escapes).
+            parsed = urlparse(path_str)
+            location = parsed.path
+            if parsed.netloc and parsed.netloc.lower() != "localhost":
+                location = f"//{parsed.netloc}{location}"
+            path_str = url2pathname(location)
+
+        resolved_path = Path(path_str)
+        if resolved_path.is_file():
+            if _match_file(resolved_path, suffixes, glob):
+                yield resolved_path
+            return
+
+        if not resolved_path.is_dir():
+            raise NotADirectoryError(
+                f"Path {path!r} is neither a file nor a directory."
+            )
+
+        for entry in resolved_path.iterdir():
+            if entry.is_file() and _match_file(entry, suffixes, glob):
+                yield entry
+        return
+
+    # Remote paths (e.g., s3)
+    fsspec = import_optional_dependency("fsspec", extra=scheme)
+    # url_to_fs returns the filesystem together with the path in the form
+    # that filesystem uses. Building PurePosixPath from the raw URL would
+    # collapse "scheme://" into "scheme:/", and would disagree with the
+    # names returned by fs.ls() below.
+    # NOTE(review): some filesystems (e.g. http) keep the full URL as the
+    # path, so PurePosixPath still normalises "//" there -- confirm whether
+    # those protocols need to be supported.
+    fs, remote_path = fsspec.core.url_to_fs(path_str)
+    if fs.isfile(remote_path):
+        path_obj = PurePosixPath(remote_path)
+        if _match_file(path_obj, suffixes, glob):
+            yield path_obj
+        return
+    if not fs.isdir(remote_path):
+        raise NotADirectoryError(f"Path {path!r} is neither a file nor a directory.")
+
+    for info in fs.ls(remote_path, detail=True):
+        if info["type"] == "file":
+            path_obj = PurePosixPath(info["name"])
+            if _match_file(path_obj, suffixes, glob):
+                yield path_obj
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
@@ -223,3 +223,14 @@ def compression_format(request):
 @pytest.fixture(params=_compression_formats_params)
 def compression_ext(request):
     return request.param[0]
+
+
+@pytest.fixture
+def directory_with_dummy_csv(tmp_path):
+    """
+    Populate ``tmp_path`` with three empty CSV files and return it.
+
+    The files are named ``file_0.csv``, ``file_1.csv`` and ``file_2.csv``.
+    """
+    for i in range(3):
+        # touch() creates the file without writing any content
+        tmp_path.joinpath(f"file_{i}.csv").touch()
+    return tmp_path
diff --git a/pandas/tests/io/parser/test_directory.py b/pandas/tests/io/parser/test_directory.py
@@ -0,0 +1,37 @@
+from csv import (
+    DictWriter,
+    reader as csv_reader,
+)
+
+import pytest
+
+
+@pytest.fixture
+def directory_data():
+    """
+    Return the CSV column names and one single-row record per file.
+
+    The second element is a list of one-entry dicts; each maps a file
+    stem to the row (column name -> value) stored under that name.
+    """
+    field_names = ["a", "b", "c"]
+    rows_by_file = [
+        {"first": {"a": 1, "b": 2, "c": 3}},
+        {"second": {"a": 4, "b": 5, "c": 6}},
+        {"third": {"a": 7, "b": 8, "c": 9}},
+    ]
+    return field_names, rows_by_file
+
+
+@pytest.fixture
+def directory_data_to_file(tmp_path, directory_data):
+    """
+    Write each record from ``directory_data`` to its own CSV in ``tmp_path``.
+
+    Every file gets a header row followed by a single data row. The
+    directory containing the files is returned.
+    """
+    header, records = directory_data
+    for record in records:
+        # The first key of the record is the file stem.
+        file_name = next(iter(record))
+        target = tmp_path / f"{file_name}.csv"
+        with target.open("w", newline="", encoding="utf-8") as handle:
+            csv_writer = DictWriter(handle, fieldnames=header)
+            csv_writer.writeheader()
+            csv_writer.writerow(record[file_name])
+    return tmp_path
+
+
+def test_directory_data(directory_data_to_file):
+    """Check that the fixture wrote three CSV files with the expected header."""
+    written = list(directory_data_to_file.iterdir())
+    assert len(written) == 3
+    for csv_path in written:
+        with csv_path.open(encoding="utf-8") as handle:
+            # Only the first row (the header) is inspected.
+            first_row = next(csv_reader(handle))
+            assert first_row == ["a", "b", "c"]
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -695,3 +695,10 @@ def test_pyarrow_read_csv_datetime_dtype():
     expect = pd.DataFrame({"date": expect_data})
 
     tm.assert_frame_equal(expect, result)
+
+
+def test_iterdir(directory_with_dummy_csv):
+    """Check that ``iterdir`` yields exactly the files created by the fixture."""
+    # Materialise the generator first: asserting only inside a loop would
+    # let the test pass vacuously if nothing were yielded.
+    files = list(icom.iterdir(directory_with_dummy_csv))
+    assert sorted(file.name for file in files) == [
+        "file_0.csv",
+        "file_1.csv",
+        "file_2.csv",
+    ]
+    for file in files:
+        assert file.is_file()
+        assert file.name.startswith("file_")
+        assert file.suffix == ".csv"
diff --git a/web/pandas/static/img/books/pandas_cookbook_3.jpeg b/web/pandas/static/img/books/pandas_cookbook_3.jpeg