Skip to content

Commit 16cf492

Browse files
committed
bug fix
1 parent 84d6bd3 commit 16cf492

File tree

4 files changed

+184
-1
lines changed

4 files changed

+184
-1
lines changed

pandas/io/common.py

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import codecs
1010
from collections import defaultdict
1111
from collections.abc import (
12+
Generator,
1213
Hashable,
1314
Mapping,
1415
Sequence,
@@ -26,7 +27,10 @@
2627
)
2728
import mmap
2829
import os
29-
from pathlib import Path
30+
from pathlib import (
31+
Path,
32+
PurePosixPath,
33+
)
3034
import re
3135
import tarfile
3236
from typing import (
@@ -1282,3 +1286,126 @@ def dedup_names(
12821286
counts[col] = cur_count + 1
12831287

12841288
return names
1289+
1290+
1291+
def _match_file(
1292+
path: Path | PurePosixPath, extensions: set[str] | None, glob: str | None
1293+
) -> bool:
1294+
"""Check if the file matches the given extensions and glob pattern.
1295+
Parameters
1296+
----------
1297+
path : Path or PurePosixPath
1298+
The file path to check.
1299+
extensions : set[str]
1300+
A set of file extensions to match against.
1301+
glob : str
1302+
A glob pattern to match against.
1303+
Returns
1304+
-------
1305+
bool
1306+
True if the file matches the extensions and glob pattern, False otherwise.
1307+
"""
1308+
return (extensions is None or path.suffix.lower() in extensions) and (
1309+
glob is None or path.match(glob)
1310+
)
1311+
1312+
1313+
def iterdir(
    path: FilePath,
    extensions: str | list[str] | None = None,
    glob: str | None = None,
) -> Generator[Path | PurePosixPath]:
    """Yield file paths in a directory (no nesting allowed).

    Supports:
    - Local paths (str, os.PathLike)
    - file:// URLs
    - Remote paths (e.g., s3://) via fsspec (if installed)

    Parameters
    ----------
    path : FilePath
        Path to the directory (local or remote). A path to a single file is
        also accepted; that file is yielded if it passes the filters.
    extensions : str or list of str, optional
        Only yield files with the given extension(s). Case-insensitive.
        The leading dot is optional (``"csv"`` and ``".csv"`` are equivalent).
        If None, all files are yielded.
    glob : str, optional
        Only yield files matching the given glob pattern.
        If None, all files are yielded.

    Yields
    ------
    pathlib.Path or pathlib.PurePosixPath
        File paths within the directory. Local paths are yielded as ``Path``,
        remote paths as ``PurePosixPath``.

    Raises
    ------
    NotADirectoryError
        If the given path is neither a file nor a directory.
    ImportError
        If fsspec is required but not installed.
    """
    # Normalise into a separate set so the ``extensions`` parameter keeps its
    # declared type. ``Path.suffix`` always includes the leading dot, so a bare
    # "csv" is turned into ".csv"; an empty string is kept as-is so it can
    # still select files without any suffix.
    suffixes: set[str] | None = None
    if extensions is not None:
        if isinstance(extensions, str):
            extensions = [extensions]
        suffixes = {
            ext.lower() if not ext or ext.startswith(".") else f".{ext.lower()}"
            for ext in extensions
        }

    if isinstance(path, os.PathLike):
        path = os.fspath(path)

    parsed = parse_url(path)
    scheme = parsed.scheme or "file"

    if scheme == "file":
        # For local paths and file:// URLs only the path component is used.
        resolved_path = Path(parsed.path)
        if resolved_path.is_file():
            if _match_file(resolved_path, suffixes, glob):
                yield resolved_path
            return

        if not resolved_path.is_dir():
            raise NotADirectoryError(
                f"Path {path!r} is neither a file nor a directory."
            )

        # Only direct children that are regular files; no recursion.
        for entry in resolved_path.iterdir():
            if entry.is_file() and _match_file(entry, suffixes, glob):
                yield entry
        return

    # Remote paths (e.g., s3)
    fsspec = import_optional_dependency("fsspec", extra=scheme)
    fs = fsspec.filesystem(scheme)
    if fs.isfile(path):
        # NOTE(review): PurePosixPath collapses the "//" after the scheme, so
        # "s3://bucket/key" is yielded as "s3:/bucket/key" - confirm callers
        # can handle this form.
        path_obj = PurePosixPath(path)
        if _match_file(path_obj, suffixes, glob):
            yield path_obj
        return
    if not fs.isdir(path):
        raise NotADirectoryError(f"Remote path {path!r} is not a directory.")

    for info in fs.ls(path, detail=True):
        if info["type"] != "file":
            continue
        path_obj = PurePosixPath(info["name"])
        if _match_file(path_obj, suffixes, glob):
            yield path_obj

pandas/tests/io/conftest.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,3 +223,14 @@ def compression_format(request):
223223
@pytest.fixture(params=_compression_formats_params)
224224
def compression_ext(request):
225225
return request.param[0]
226+
227+
228+
@pytest.fixture
def directory_with_dummy_csv(tmp_path):
    """
    Populate ``tmp_path`` with three empty files named ``file_0.csv`` to
    ``file_2.csv`` and return the directory.
    """
    for index in range(3):
        (tmp_path / f"file_{index}.csv").touch()
    return tmp_path
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from csv import (
2+
DictWriter,
3+
reader as csv_reader,
4+
)
5+
6+
import pytest
7+
8+
9+
@pytest.fixture
def directory_data():
    """Return the column names and one ``{file stem: row}`` dict per CSV file."""
    field_names = ["a", "b", "c"]
    rows_by_file = [
        {"first": {"a": 1, "b": 2, "c": 3}},
        {"second": {"a": 4, "b": 5, "c": 6}},
        {"third": {"a": 7, "b": 8, "c": 9}},
    ]
    return field_names, rows_by_file
16+
17+
18+
@pytest.fixture
def directory_data_to_file(tmp_path, directory_data):
    """
    Write each record of ``directory_data`` to ``<stem>.csv`` under ``tmp_path``.

    Every file gets a header row followed by a single data row. The populated
    ``tmp_path`` is returned.
    """
    columns, records = directory_data
    for record in records:
        # Each record is keyed by the file stem; only its first key is used.
        stem = next(iter(record))
        csv_path = tmp_path / f"{stem}.csv"
        with csv_path.open("w", newline="") as handle:
            csv_writer = DictWriter(handle, fieldnames=columns)
            csv_writer.writeheader()
            csv_writer.writerow(record[stem])
    return tmp_path
29+
30+
31+
def test_directory_data(directory_data_to_file):
    """Check that the fixture wrote three CSV files, each with header a, b, c."""
    # List the directory once; the leftover debug print was removed.
    files = list(directory_data_to_file.iterdir())
    assert len(files) == 3
    for file in files:
        with file.open() as f:
            header = next(csv_reader(f))
        assert header == ["a", "b", "c"]

pandas/tests/io/test_common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,3 +695,10 @@ def test_pyarrow_read_csv_datetime_dtype():
695695
expect = pd.DataFrame({"date": expect_data})
696696

697697
tm.assert_frame_equal(expect, result)
698+
699+
700+
def test_iterdir(directory_with_dummy_csv):
    # Materialize the generator first: with the assertions only inside a loop,
    # an empty result would let the test pass without checking anything.
    files = list(icom.iterdir(directory_with_dummy_csv))
    # The fixture creates file_0.csv, file_1.csv and file_2.csv; directory
    # listing order is not guaranteed, hence the sort.
    assert sorted(file.name for file in files) == [
        "file_0.csv",
        "file_1.csv",
        "file_2.csv",
    ]
    for file in files:
        assert file.is_file()
        assert file.name.startswith("file_")
        assert file.suffix == ".csv"

0 commit comments

Comments
 (0)