@@ -107,6 +107,7 @@ class IOHandles:
107
107
handle : Buffer
108
108
created_handles : List [Buffer ] = dataclasses .field (default_factory = list )
109
109
is_wrapped : bool = False
110
+ is_mmap : bool = False
110
111
111
112
def close (self ) -> None :
112
113
"""
@@ -604,49 +605,49 @@ def get_handle(
604
605
except ImportError :
605
606
pass
606
607
607
- handles : List [Buffer ] = list ()
608
-
609
608
# Windows does not default to utf-8. Set to utf-8 for a consistent behavior
610
609
if encoding is None :
611
610
encoding = "utf-8"
612
611
613
612
# Convert pathlib.Path/py.path.local or string
614
- path_or_buf = stringify_path (path_or_buf )
615
- is_path = isinstance (path_or_buf , str )
616
- f = path_or_buf
613
+ handle = stringify_path (path_or_buf )
617
614
618
615
compression , compression_args = get_compression_method (compression )
619
- if is_path :
620
- compression = infer_compression (path_or_buf , compression )
616
+ compression = infer_compression (handle , compression )
621
617
622
- if compression :
618
+ # memory mapping needs to be the first step
619
+ handle , memory_map , handles = _maybe_memory_map (
620
+ handle , memory_map , encoding , mode , errors
621
+ )
623
622
623
+ is_path = isinstance (handle , str )
624
+ if compression :
624
625
# GZ Compression
625
626
if compression == "gzip" :
626
627
if is_path :
627
- assert isinstance (path_or_buf , str )
628
- f = gzip .GzipFile (filename = path_or_buf , mode = mode , ** compression_args )
628
+ assert isinstance (handle , str )
629
+ handle = gzip .GzipFile (filename = handle , mode = mode , ** compression_args )
629
630
else :
630
- f = gzip .GzipFile (
631
- fileobj = path_or_buf , # type: ignore[arg-type]
631
+ handle = gzip .GzipFile (
632
+ fileobj = handle , # type: ignore[arg-type]
632
633
mode = mode ,
633
634
** compression_args ,
634
635
)
635
636
636
637
# BZ Compression
637
638
elif compression == "bz2" :
638
- f = bz2 .BZ2File (
639
- path_or_buf , mode = mode , ** compression_args # type: ignore[arg-type]
639
+ handle = bz2 .BZ2File (
640
+ handle , mode = mode , ** compression_args # type: ignore[arg-type]
640
641
)
641
642
642
643
# ZIP Compression
643
644
elif compression == "zip" :
644
- f = _BytesZipFile (path_or_buf , mode , ** compression_args )
645
- if f .mode == "r" :
646
- handles .append (f )
647
- zip_names = f .namelist ()
645
+ handle = _BytesZipFile (handle , mode , ** compression_args )
646
+ if handle .mode == "r" :
647
+ handles .append (handle )
648
+ zip_names = handle .namelist ()
648
649
if len (zip_names ) == 1 :
649
- f = f .open (zip_names .pop ())
650
+ handle = handle .open (zip_names .pop ())
650
651
elif len (zip_names ) == 0 :
651
652
raise ValueError (f"Zero files found in ZIP file { path_or_buf } " )
652
653
else :
@@ -657,64 +658,52 @@ def get_handle(
657
658
658
659
# XZ Compression
659
660
elif compression == "xz" :
660
- f = get_lzma_file (lzma )(path_or_buf , mode )
661
+ handle = get_lzma_file (lzma )(handle , mode )
661
662
662
663
# Unrecognized Compression
663
664
else :
664
665
msg = f"Unrecognized compression type: { compression } "
665
666
raise ValueError (msg )
666
667
667
- assert not isinstance (f , str )
668
- handles .append (f )
668
+ assert not isinstance (handle , str )
669
+ handles .append (handle )
669
670
670
671
elif is_path :
671
672
# Check whether the filename is to be opened in binary mode.
672
673
# Binary mode does not support 'encoding' and 'newline'.
673
- is_binary_mode = "b" in mode
674
- assert isinstance (path_or_buf , str )
675
- if encoding and not is_binary_mode :
674
+ assert isinstance (handle , str )
675
+ if encoding and "b" not in mode :
676
676
# Encoding
677
- f = open (path_or_buf , mode , encoding = encoding , errors = errors , newline = "" )
677
+ handle = open (handle , mode , encoding = encoding , errors = errors , newline = "" )
678
678
else :
679
679
# Binary mode
680
- f = open (path_or_buf , mode )
681
- handles .append (f )
680
+ handle = open (handle , mode )
681
+ handles .append (handle )
682
682
683
683
# Convert BytesIO or file objects passed with an encoding
684
684
is_wrapped = False
685
685
if is_text and (
686
686
compression
687
- or isinstance (f , need_text_wrapping )
688
- or "b" in getattr (f , "mode" , "" )
687
+ or isinstance (handle , need_text_wrapping )
688
+ or "b" in getattr (handle , "mode" , "" )
689
689
):
690
- f = TextIOWrapper (
691
- f , encoding = encoding , errors = errors , newline = "" # type: ignore[arg-type]
690
+ handle = TextIOWrapper (
691
+ handle , # type: ignore[arg-type]
692
+ encoding = encoding ,
693
+ errors = errors ,
694
+ newline = "" ,
692
695
)
693
- handles .append (f )
696
+ handles .append (handle )
694
697
# do not mark as wrapped when the user provided a string
695
698
is_wrapped = not is_path
696
699
697
- if memory_map and hasattr (f , "fileno" ):
698
- assert not isinstance (f , str )
699
- try :
700
- wrapped = cast (mmap .mmap , _MMapWrapper (f )) # type: ignore[arg-type]
701
- f .close ()
702
- handles .remove (f )
703
- handles .append (wrapped )
704
- f = wrapped
705
- except Exception :
706
- # we catch any errors that may have occurred
707
- # because that is consistent with the lower-level
708
- # functionality of the C engine (pd.read_csv), so
709
- # leave the file handler as is then
710
- pass
711
-
712
700
handles .reverse () # close the most recently added buffer first
713
- assert not isinstance (f , str )
701
+ assert not isinstance (handle , str )
714
702
return IOHandles (
715
- handle = f ,
703
+ handle = handle ,
716
704
created_handles = handles ,
717
705
is_wrapped = is_wrapped ,
706
+ is_mmap = memory_map ,
718
707
)
719
708
720
709
@@ -778,9 +767,16 @@ class _MMapWrapper(abc.Iterator):
778
767
"""
779
768
780
769
def __init__ (self , f : IO ):
770
+ self .attributes = {}
771
+ for attribute in ("seekable" , "readable" , "writeable" ):
772
+ if not hasattr (f , attribute ):
773
+ continue
774
+ self .attributes [attribute ] = getattr (f , attribute )()
781
775
self .mmap = mmap .mmap (f .fileno (), 0 , access = mmap .ACCESS_READ )
782
776
783
777
def __getattr__ (self , name : str ):
778
+ if name in self .attributes :
779
+ return lambda : self .attributes [name ]
784
780
return getattr (self .mmap , name )
785
781
786
782
def __iter__ (self ) -> "_MMapWrapper" :
@@ -799,3 +795,42 @@ def __next__(self) -> str:
799
795
if newline == "" :
800
796
raise StopIteration
801
797
return newline
798
+
799
+
800
def _maybe_memory_map(
    handle: FileOrBuffer,
    memory_map: bool,
    encoding: str,
    mode: str,
    errors: Optional[str],
) -> Tuple[FileOrBuffer, bool, List[Buffer]]:
    """
    Try to memory-map a file path or an open file object.

    Parameters
    ----------
    handle : FileOrBuffer
        A path string or an object; mapping is only attempted for strings
        and for objects that have a ``fileno`` attribute.
    memory_map : bool
        Whether memory mapping was requested.
    encoding : str
        Encoding used when a path has to be opened in text mode.
    mode : str
        Mode used to open a path; text mode is used only when ``encoding``
        is truthy and ``"b"`` is not in ``mode``.
    errors : str or None
        Error policy forwarded to ``open`` in text mode.

    Returns
    -------
    tuple of (handle, memory_map, handles)
        ``handle`` is the ``_MMapWrapper`` on success, otherwise the input
        handle (or the file opened from the path). ``memory_map`` tells
        whether mapping actually happened. ``handles`` lists the objects
        created here.
    """
    handles: List[Buffer] = []
    memory_map &= hasattr(handle, "fileno") or isinstance(handle, str)
    if not memory_map:
        return handle, memory_map, handles

    # need to open the file first
    if isinstance(handle, str):
        if encoding and "b" not in mode:
            # Encoding
            handle = open(handle, mode, encoding=encoding, errors=errors, newline="")
        else:
            # Binary mode
            handle = open(handle, mode)
        handles.append(handle)

    try:
        wrapped = cast(mmap.mmap, _MMapWrapper(handle))  # type: ignore[arg-type]
    except Exception:
        # we catch any errors that may have occurred
        # because that is consistent with the lower-level
        # functionality of the C engine (pd.read_csv), so
        # leave the file handler as is then
        return handle, False, handles

    # Close only the file opened above (if any). A handle supplied by the
    # caller is not in `handles` and must stay open: closing it here, and then
    # failing to remove it from `handles`, used to return a closed file with
    # memory_map=False.
    for created in handles:
        created.close()
    return wrapped, True, [wrapped]
0 commit comments