-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
/
Copy pathsasreader.py
152 lines (124 loc) · 4.26 KB
/
sasreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Read SAS sas7bdat or xport files.
"""
from abc import ABCMeta, abstractmethod
from typing import TYPE_CHECKING, Optional, Union, overload
from pandas._typing import FilePathOrBuffer, Label
from pandas.io.common import stringify_path
if TYPE_CHECKING:
from pandas import DataFrame
# TODO(PY38): replace with Protocol in Python 3.8
class ReaderBase(metaclass=ABCMeta):
    """
    Abstract interface shared by the XportReader and SAS7BDATReader classes.

    Subclasses must implement ``read`` and ``close``. The context-manager
    methods are provided here: entering yields the reader itself and
    leaving calls ``close``.
    """

    @abstractmethod
    def read(self, nrows=None):
        """Read from the file; must be implemented by subclasses."""

    @abstractmethod
    def close(self):
        """Close the reader; must be implemented by subclasses."""

    def __enter__(self):
        # The reader acts as its own context object.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Close unconditionally. The implicit ``None`` return means an
        # exception raised inside the ``with`` body is not suppressed.
        self.close()
# Typing-only signature: with an integer ``chunksize`` the declared result
# is a reader object.
@overload
def read_sas(
    filepath_or_buffer: FilePathOrBuffer,
    format: Optional[str] = ...,
    index: Optional[Label] = ...,
    encoding: Optional[str] = ...,
    chunksize: int = ...,
    iterator: bool = ...,
) -> ReaderBase: ...
# Typing-only signature: with ``chunksize`` left as None the declared result
# is either a DataFrame or a reader object.
@overload
def read_sas(
    filepath_or_buffer: FilePathOrBuffer,
    format: Optional[str] = ...,
    index: Optional[Label] = ...,
    encoding: Optional[str] = ...,
    chunksize: None = ...,
    iterator: bool = ...,
) -> Union["DataFrame", ReaderBase]: ...
def read_sas(
    filepath_or_buffer: FilePathOrBuffer,
    format: Optional[str] = None,
    index: Optional[Label] = None,
    encoding: Optional[str] = None,
    chunksize: Optional[int] = None,
    iterator: bool = False,
) -> Union["DataFrame", ReaderBase]:
    """
    Read a SAS file stored in XPORT or SAS7BDAT format.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a
        host is expected. A local file could be:
        ``file://localhost/path/to/table.sas``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    format : str {'xport', 'sas7bdat'} or None
        Which reader to use; matched case-insensitively. If None, the format
        is inferred from the file name: names ending in ``.xpt`` are read as
        'xport' and names ending in ``.sas7bdat`` as 'sas7bdat'.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data. If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.

        .. versionchanged:: 1.2

            The returned reader is a context manager.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

        .. versionchanged:: 1.2

            The returned reader is a context manager.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader

    Raises
    ------
    ValueError
        If `format` is None and `filepath_or_buffer` is not a path (the
        format cannot be inferred from a buffer), if `format` is None and
        the file name has neither a ``.xpt`` nor a ``.sas7bdat`` suffix, or
        if `format` is not one of the supported formats.
    """
    if format is None:
        # Inference needs a file name, so normalise path objects to ``str``
        # first; anything that is still not a string is treated as a buffer.
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(
                "If this is a buffer object rather "
                "than a string name, you must specify a format string"
            )

        lowered_name = filepath_or_buffer.lower()
        for suffix, inferred in ((".xpt", "xport"), (".sas7bdat", "sas7bdat")):
            if lowered_name.endswith(suffix):
                format = inferred
                break
        else:
            raise ValueError("unable to infer format of SAS file")

    # Both reader classes are constructed with the same keyword arguments.
    reader_kwargs = dict(index=index, encoding=encoding, chunksize=chunksize)
    format_key = format.lower()

    reader: ReaderBase
    # The reader modules are imported lazily so that only the one matching
    # the requested format is loaded.
    if format_key == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(filepath_or_buffer, **reader_kwargs)
    elif format_key == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(filepath_or_buffer, **reader_kwargs)
    else:
        raise ValueError("unknown SAS format")

    if not (iterator or chunksize):
        # One-shot read: the ``with`` block closes the reader once the data
        # has been read (or if reading raises).
        with reader:
            return reader.read()

    # Incremental reading was requested; the caller owns the reader and is
    # responsible for closing it.
    return reader