Skip to content

Commit c9458ad

Browse files
authored
http_backoff retry with SliceFileObj (#2542)
1 parent a49ca75 commit c9458ad

File tree

5 files changed

+117
-95
lines changed

5 files changed

+117
-95
lines changed

src/huggingface_hub/commands/lfs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@
2424
from typing import Dict, List, Optional
2525

2626
from huggingface_hub.commands import BaseHuggingfaceCLICommand
27-
from huggingface_hub.lfs import LFS_MULTIPART_UPLOAD_COMMAND, SliceFileObj
27+
from huggingface_hub.lfs import LFS_MULTIPART_UPLOAD_COMMAND
2828

2929
from ..utils import get_session, hf_raise_for_status, logging
30+
from ..utils._lfs import SliceFileObj
3031

3132

3233
logger = logging.get_logger(__name__)

src/huggingface_hub/lfs.py

Lines changed: 1 addition & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,8 @@
1616

1717
import inspect
1818
import io
19-
import os
2019
import re
2120
import warnings
22-
from contextlib import AbstractContextManager
2321
from dataclasses import dataclass
2422
from math import ceil
2523
from os.path import getsize
@@ -39,6 +37,7 @@
3937
tqdm,
4038
validate_hf_hub_args,
4139
)
40+
from .utils._lfs import SliceFileObj
4241
from .utils.sha import sha256, sha_fileobj
4342

4443

@@ -462,93 +461,3 @@ def _upload_parts_hf_transfer(
462461
if not supports_callback:
463462
progress.update(total)
464463
return output
465-
466-
467-
class SliceFileObj(AbstractContextManager):
468-
"""
469-
Utility context manager to read a *slice* of a seekable file-like object as a seekable, file-like object.
470-
471-
This is NOT thread safe
472-
473-
Inspired by stackoverflow.com/a/29838711/593036
474-
475-
Credits to @julien-c
476-
477-
Args:
478-
fileobj (`BinaryIO`):
479-
A file-like object to slice. MUST implement `tell()` and `seek()` (and `read()` of course).
480-
`fileobj` will be reset to its original position when exiting the context manager.
481-
seek_from (`int`):
482-
The start of the slice (offset from position 0 in bytes).
483-
read_limit (`int`):
484-
The maximum number of bytes to read from the slice.
485-
486-
Attributes:
487-
previous_position (`int`):
488-
The previous position
489-
490-
Examples:
491-
492-
Reading 200 bytes with an offset of 128 bytes from a file (ie bytes 128 to 327):
493-
```python
494-
>>> with open("path/to/file", "rb") as file:
495-
... with SliceFileObj(file, seek_from=128, read_limit=200) as fslice:
496-
... fslice.read(...)
497-
```
498-
499-
Reading a file in chunks of 512 bytes
500-
```python
501-
>>> import os
502-
>>> chunk_size = 512
503-
>>> file_size = os.getsize("path/to/file")
504-
>>> with open("path/to/file", "rb") as file:
505-
... for chunk_idx in range(ceil(file_size / chunk_size)):
506-
... with SliceFileObj(file, seek_from=chunk_idx * chunk_size, read_limit=chunk_size) as fslice:
507-
... chunk = fslice.read(...)
508-
509-
```
510-
"""
511-
512-
def __init__(self, fileobj: BinaryIO, seek_from: int, read_limit: int):
513-
self.fileobj = fileobj
514-
self.seek_from = seek_from
515-
self.read_limit = read_limit
516-
517-
def __enter__(self):
518-
self._previous_position = self.fileobj.tell()
519-
end_of_stream = self.fileobj.seek(0, os.SEEK_END)
520-
self._len = min(self.read_limit, end_of_stream - self.seek_from)
521-
# ^^ The actual number of bytes that can be read from the slice
522-
self.fileobj.seek(self.seek_from, io.SEEK_SET)
523-
return self
524-
525-
def __exit__(self, exc_type, exc_value, traceback):
526-
self.fileobj.seek(self._previous_position, io.SEEK_SET)
527-
528-
def read(self, n: int = -1):
529-
pos = self.tell()
530-
if pos >= self._len:
531-
return b""
532-
remaining_amount = self._len - pos
533-
data = self.fileobj.read(remaining_amount if n < 0 else min(n, remaining_amount))
534-
return data
535-
536-
def tell(self) -> int:
537-
return self.fileobj.tell() - self.seek_from
538-
539-
def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
540-
start = self.seek_from
541-
end = start + self._len
542-
if whence in (os.SEEK_SET, os.SEEK_END):
543-
offset = start + offset if whence == os.SEEK_SET else end + offset
544-
offset = max(start, min(offset, end))
545-
whence = os.SEEK_SET
546-
elif whence == os.SEEK_CUR:
547-
cur_pos = self.fileobj.tell()
548-
offset = max(start - cur_pos, min(offset, end - cur_pos))
549-
else:
550-
raise ValueError(f"whence value {whence} is not supported")
551-
return self.fileobj.seek(offset, whence) - self.seek_from
552-
553-
def __iter__(self):
554-
yield self.read(n=4 * 1024 * 1024)

src/huggingface_hub/utils/_http.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
)
4444
from . import logging
4545
from ._fixes import JSONDecodeError
46+
from ._lfs import SliceFileObj
4647
from ._typing import HTTP_METHOD_T
4748

4849

@@ -290,7 +291,7 @@ def http_backoff(
290291
# first HTTP request. We need to save the initial position so that the full content
291292
# of the file is re-sent on http backoff. See warning tip in docstring.
292293
io_obj_initial_pos = None
293-
if "data" in kwargs and isinstance(kwargs["data"], io.IOBase):
294+
if "data" in kwargs and isinstance(kwargs["data"], (io.IOBase, SliceFileObj)):
294295
io_obj_initial_pos = kwargs["data"].tell()
295296

296297
session = get_session()

src/huggingface_hub/utils/_lfs.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# coding=utf-8
2+
# Copyright 2019-present, the HuggingFace Inc. team.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Git LFS related utilities"""
16+
17+
import io
18+
import os
19+
from contextlib import AbstractContextManager
20+
from typing import BinaryIO
21+
22+
23+
class SliceFileObj(AbstractContextManager):
    """
    Utility context manager to read a *slice* of a seekable file-like object as a seekable, file-like object.

    This is NOT thread safe: the slice shares the position of the underlying `fileobj`.

    Inspired by stackoverflow.com/a/29838711/593036

    Credits to @julien-c

    Args:
        fileobj (`BinaryIO`):
            A file-like object to slice. MUST implement `tell()` and `seek()` (and `read()` of course).
            `fileobj` will be reset to its original position when exiting the context manager.
        seek_from (`int`):
            The start of the slice (offset from position 0 in bytes).
        read_limit (`int`):
            The maximum number of bytes to read from the slice.

    Attributes:
        fileobj (`BinaryIO`):
            The underlying file-like object.
        seek_from (`int`):
            Absolute offset in `fileobj` at which the slice starts.
        read_limit (`int`):
            Requested maximum size of the slice, in bytes.

    Examples:

    Reading 200 bytes with an offset of 128 bytes from a file (ie bytes 128 to 327):
    ```python
    >>> with open("path/to/file", "rb") as file:
    ...     with SliceFileObj(file, seek_from=128, read_limit=200) as fslice:
    ...         fslice.read(...)
    ```

    Reading a file in chunks of 512 bytes
    ```python
    >>> import os
    >>> from math import ceil
    >>> chunk_size = 512
    >>> file_size = os.path.getsize("path/to/file")
    >>> with open("path/to/file", "rb") as file:
    ...     for chunk_idx in range(ceil(file_size / chunk_size)):
    ...         with SliceFileObj(file, seek_from=chunk_idx * chunk_size, read_limit=chunk_size) as fslice:
    ...             chunk = fslice.read(...)
    ```
    """

    def __init__(self, fileobj: BinaryIO, seek_from: int, read_limit: int):
        self.fileobj = fileobj
        self.seek_from = seek_from
        self.read_limit = read_limit

    def __enter__(self):
        # Remember where the caller left the stream so `__exit__` can restore it.
        self._previous_position = self.fileobj.tell()
        end_of_stream = self.fileobj.seek(0, os.SEEK_END)
        # The actual number of bytes that can be read from the slice. Clamped to 0 so that a
        # `seek_from` located past the end of the stream gives an empty slice, not a negative length.
        self._len = max(0, min(self.read_limit, end_of_stream - self.seek_from))
        self.fileobj.seek(self.seek_from, io.SEEK_SET)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the position the underlying stream had before entering the context.
        self.fileobj.seek(self._previous_position, io.SEEK_SET)

    def read(self, n: int = -1):
        """Read at most `n` bytes from the slice (everything that remains if `n` is negative)."""
        pos = self.tell()
        if pos >= self._len:
            return b""
        remaining_amount = self._len - pos
        data = self.fileobj.read(remaining_amount if n < 0 else min(n, remaining_amount))
        return data

    def tell(self) -> int:
        """Return the current position, relative to the start of the slice."""
        return self.fileobj.tell() - self.seek_from

    def seek(self, offset: int, whence: int = os.SEEK_SET) -> int:
        """
        Move within the slice and return the new position relative to the start of the slice.

        The target position is clamped to the slice boundaries.

        Raises:
            `ValueError`: If `whence` is not one of `os.SEEK_SET`, `os.SEEK_CUR` or `os.SEEK_END`.
        """
        start = self.seek_from
        end = start + self._len
        if whence in (os.SEEK_SET, os.SEEK_END):
            # Translate to an absolute offset in the underlying stream, then clamp to [start, end].
            offset = start + offset if whence == os.SEEK_SET else end + offset
            offset = max(start, min(offset, end))
            whence = os.SEEK_SET
        elif whence == os.SEEK_CUR:
            # Keep the relative move, but clamp it so the final position stays within [start, end].
            cur_pos = self.fileobj.tell()
            offset = max(start - cur_pos, min(offset, end - cur_pos))
        else:
            raise ValueError(f"whence value {whence} is not supported")
        return self.fileobj.seek(offset, whence) - self.seek_from

    def __iter__(self):
        # Yield successive chunks of at most 4 MiB until the slice is exhausted. Yielding a single
        # chunk would silently truncate any slice larger than the chunk size.
        while True:
            chunk = self.read(n=4 * 1024 * 1024)
            if not chunk:
                return
            yield chunk

tests/test_lfs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
from hashlib import sha256
44
from io import BytesIO
55

6-
from huggingface_hub.lfs import SliceFileObj, UploadInfo
6+
from huggingface_hub.lfs import UploadInfo
77
from huggingface_hub.utils import SoftTemporaryDirectory
8+
from huggingface_hub.utils._lfs import SliceFileObj
89

910

1011
class TestUploadInfo(unittest.TestCase):

0 commit comments

Comments
 (0)