1
1
# Copyright (c) 2012, Lambda Foundry, Inc.
2
2
# See LICENSE for the license
3
- import bz2
4
3
from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC
5
4
from errno import ENOENT
6
- import gzip
7
- import io
8
- import os
9
5
import sys
10
6
import time
11
7
import warnings
12
- import zipfile
13
8
14
9
from libc.stdlib cimport free
15
10
from libc.string cimport strcasecmp, strlen, strncpy
16
11
17
12
import cython
18
13
from cython import Py_ssize_t
19
14
20
- from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString
15
+ from cpython.bytes cimport PyBytes_AsString
21
16
from cpython.exc cimport PyErr_Fetch, PyErr_Occurred
22
17
from cpython.object cimport PyObject
23
18
from cpython.ref cimport Py_XDECREF
@@ -67,7 +62,6 @@ from pandas._libs.khash cimport (
67
62
khiter_t,
68
63
)
69
64
70
- from pandas.compat import get_lzma_file, import_lzma
71
65
from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning
72
66
73
67
from pandas.core.dtypes.common import (
@@ -82,11 +76,10 @@ from pandas.core.dtypes.common import (
82
76
)
83
77
from pandas.core.dtypes.concat import union_categoricals
84
78
85
- lzma = import_lzma()
86
-
87
79
cdef:
88
80
float64_t INF = < float64_t> np.inf
89
81
float64_t NEGINF = - INF
82
+ int64_t DEFAULT_CHUNKSIZE = 256 * 1024
90
83
91
84
92
85
cdef extern from " headers/portable.h" :
@@ -275,14 +268,15 @@ cdef extern from "parser/io.h":
275
268
size_t * bytes_read, int * status)
276
269
277
270
278
- DEFAULT_CHUNKSIZE = 256 * 1024
279
-
280
-
281
271
cdef class TextReader:
282
272
"""
283
273
284
274
# source: StringIO or file object
285
275
276
+ .. versionchanged:: 1.2.0
277
+ Removed the 'compression', 'memory_map', and 'encoding' arguments.
278
+ These arguments are outsourced to CParserWrapper.
279
+ 'source' has to be a file handle.
286
280
"""
287
281
288
282
cdef:
@@ -299,16 +293,14 @@ cdef class TextReader:
299
293
300
294
cdef public:
301
295
int64_t leading_cols, table_width, skipfooter, buffer_lines
302
- bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory
296
+ bint allow_leading_cols, mangle_dupe_cols, low_memory
303
297
bint delim_whitespace
304
298
object delimiter, converters
305
299
object na_values
306
300
object header, orig_header, names, header_start, header_end
307
301
object index_col
308
302
object skiprows
309
303
object dtype
310
- object encoding
311
- object compression
312
304
object usecols
313
305
list dtype_cast_order
314
306
set unnamed_cols
@@ -321,18 +313,15 @@ cdef class TextReader:
321
313
header_end = 0 ,
322
314
index_col = None ,
323
315
names = None ,
324
- bint memory_map = False ,
325
316
tokenize_chunksize = DEFAULT_CHUNKSIZE,
326
317
bint delim_whitespace = False ,
327
- compression = None ,
328
318
converters = None ,
329
319
bint skipinitialspace = False ,
330
320
escapechar = None ,
331
321
bint doublequote = True ,
332
322
quotechar = b' "' ,
333
323
quoting = 0 ,
334
324
lineterminator = None ,
335
- encoding = None ,
336
325
comment = None ,
337
326
decimal = b' .' ,
338
327
thousands = None ,
@@ -356,15 +345,7 @@ cdef class TextReader:
356
345
bint skip_blank_lines = True ):
357
346
358
347
# set encoding for native Python and C library
359
- if encoding is not None :
360
- if not isinstance (encoding, bytes):
361
- encoding = encoding.encode(' utf-8' )
362
- encoding = encoding.lower()
363
- self .c_encoding = < char * > encoding
364
- else :
365
- self .c_encoding = NULL
366
-
367
- self .encoding = encoding
348
+ self .c_encoding = NULL
368
349
369
350
self .parser = parser_new()
370
351
self .parser.chunksize = tokenize_chunksize
@@ -374,9 +355,6 @@ cdef class TextReader:
374
355
# For timekeeping
375
356
self .clocks = []
376
357
377
- self .compression = compression
378
- self .memory_map = memory_map
379
-
380
358
self .parser.usecols = (usecols is not None )
381
359
382
360
self ._setup_parser_source(source)
@@ -562,11 +540,6 @@ cdef class TextReader:
562
540
parser_del(self .parser)
563
541
564
542
def close (self ):
565
- # we need to properly close an open derived
566
- # filehandle here, e.g. and UTFRecoder
567
- if self .handle is not None :
568
- self .handle.close()
569
-
570
543
# also preemptively free all allocated memory
571
544
parser_free(self .parser)
572
545
if self .true_set:
@@ -614,82 +587,15 @@ cdef class TextReader:
614
587
cdef:
615
588
void * ptr
616
589
617
- self .parser.cb_io = NULL
618
- self .parser.cb_cleanup = NULL
619
-
620
- if self .compression:
621
- if self .compression == ' gzip' :
622
- if isinstance (source, str ):
623
- source = gzip.GzipFile(source, ' rb' )
624
- else :
625
- source = gzip.GzipFile(fileobj = source)
626
- elif self .compression == ' bz2' :
627
- source = bz2.BZ2File(source, ' rb' )
628
- elif self .compression == ' zip' :
629
- zip_file = zipfile.ZipFile(source)
630
- zip_names = zip_file.namelist()
631
-
632
- if len (zip_names) == 1 :
633
- file_name = zip_names.pop()
634
- source = zip_file.open(file_name)
635
-
636
- elif len (zip_names) == 0 :
637
- raise ValueError (f' Zero files found in compressed '
638
- f' zip file {source}' )
639
- else :
640
- raise ValueError (f' Multiple files found in compressed '
641
- f' zip file {zip_names}' )
642
- elif self .compression == ' xz' :
643
- if isinstance (source, str ):
644
- source = get_lzma_file(lzma)(source, ' rb' )
645
- else :
646
- source = get_lzma_file(lzma)(filename = source)
647
- else :
648
- raise ValueError (f' Unrecognized compression type: '
649
- f' {self.compression}' )
650
-
651
- if (self .encoding and hasattr (source, " read" ) and
652
- not hasattr (source, " encoding" )):
653
- source = io.TextIOWrapper(
654
- source, self .encoding.decode(' utf-8' ), newline = ' ' )
655
-
656
- self .encoding = b' utf-8'
657
- self .c_encoding = < char * > self .encoding
658
-
659
- self .handle = source
660
-
661
- if isinstance (source, str ):
662
- encoding = sys.getfilesystemencoding() or " utf-8"
663
- usource = source
664
- source = source.encode(encoding)
665
-
666
- if self .memory_map:
667
- ptr = new_mmap(source)
668
- if ptr == NULL :
669
- # fall back
670
- ptr = new_file_source(source, self .parser.chunksize)
671
- self .parser.cb_io = & buffer_file_bytes
672
- self .parser.cb_cleanup = & del_file_source
673
- else :
674
- self .parser.cb_io = & buffer_mmap_bytes
675
- self .parser.cb_cleanup = & del_mmap
676
- else :
677
- ptr = new_file_source(source, self .parser.chunksize)
678
- self .parser.cb_io = & buffer_file_bytes
679
- self .parser.cb_cleanup = & del_file_source
680
- self .parser.source = ptr
681
-
682
- elif hasattr (source, ' read' ):
683
- # e.g., StringIO
684
-
685
- ptr = new_rd_source(source)
686
- self .parser.source = ptr
687
- self .parser.cb_io = & buffer_rd_bytes
688
- self .parser.cb_cleanup = & del_rd_source
689
- else :
590
+ if not hasattr (source, " read" ):
690
591
raise IOError (f' Expected file path name or file-like object, '
691
592
f' got {type(source)} type' )
692
593
594
+ ptr = new_rd_source(source)
595
+ self .parser.source = ptr
596
+ self .parser.cb_io = & buffer_rd_bytes
597
+ self .parser.cb_cleanup = & del_rd_source
598
+
693
599
cdef _get_header(self ):
694
600
# header is now a list of lists, so field_count should use header[0]
695
601
0 commit comments