
Commit 78dd035

gfyoung authored and jreback committed
CLN: Drop the as_recarray parameter in read_csv (pandas-dev#18804)
Deprecated back in 0.19.0, xref gh-13373.
1 parent 8e0e354 commit 78dd035
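
The deprecation message pointed users at pd.read_csv(...).to_records() as the replacement. A minimal sketch of that migration, using made-up CSV content and column names:

    from io import StringIO

    import pandas as pd

    data = "one,two\n1,2.5\n2,3.5"

    # Previously: pd.read_csv(StringIO(data), as_recarray=True)
    # Now: parse to a DataFrame first, then convert to a record array.
    records = pd.read_csv(StringIO(data)).to_records(index=False)

    print(records.dtype.names)  # ('one', 'two')
    print(records[0])           # (1, 2.5)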

File tree

10 files changed (+5, -332 lines)


doc/source/io.rst (-9)

@@ -143,15 +143,6 @@ usecols : array-like or callable, default ``None``
     pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])

  Using this parameter results in much faster parsing time and lower memory usage.
-as_recarray : boolean, default ``False``
-  .. deprecated:: 0.18.2
-
-     Please call ``pd.read_csv(...).to_records()`` instead.
-
-  Return a NumPy recarray instead of a DataFrame after parsing the data. If
-  set to ``True``, this option takes precedence over the ``squeeze`` parameter.
-  In addition, as row indices are not available in such a format, the ``index_col``
-  parameter will be ignored.
 squeeze : boolean, default ``False``
   If the parsed data only contains one column then return a Series.
 prefix : str, default ``None``
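
The deleted doc text notes that as_recarray ignored index_col. With the .to_records() route the caller decides whether the index becomes a field; a short sketch with illustrative file contents:

    from io import StringIO

    import pandas as pd

    data = "id,val\n10,1.5\n20,2.5"
    df = pd.read_csv(StringIO(data), index_col='id')

    with_index = df.to_records()                # fields: ('id', 'val')
    without_index = df.to_records(index=False)  # fields: ('val',)

    print(with_index.dtype.names, without_index.dtype.names)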

doc/source/whatsnew/v0.22.0.txt (+1)

@@ -222,6 +222,7 @@ Removal of prior version deprecations/changes
   and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:`18601` & :issue:`18668`)
 - ``DatetimeIndex.to_datetime``, ``Timestamp.to_datetime``, ``PeriodIndex.to_datetime``, and ``Index.to_datetime`` have been removed (:issue:`8254`, :issue:`14096`, :issue:`14113`)
 - :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`)
+- :func:`read_csv` has dropped the ``as_recarray`` parameter (:issue:`13373`)

 .. _whatsnew_0220.performance:
pandas/_libs/parsers.pyx (+2, -86)

@@ -91,7 +91,6 @@ except NameError:
     basestring = str

 cdef extern from "src/numpy_helper.h":
-    object sarr_from_data(cnp.dtype, int length, void* data)
     void transfer_object_column(char *dst, char *src, size_t stride,
                                 size_t length)

@@ -302,7 +301,6 @@ cdef class TextReader:
         object delimiter, converters, delim_whitespace
         object na_values
         object memory_map
-        object as_recarray
         object header, orig_header, names, header_start, header_end
         object index_col
         object low_memory
@@ -334,8 +332,6 @@

                   converters=None,

-                  as_recarray=False,
-
                   skipinitialspace=False,
                   escapechar=None,
                   doublequote=True,
@@ -489,8 +485,6 @@
         self.converters = converters

         self.na_filter = na_filter
-        self.as_recarray = as_recarray
-
         self.compact_ints = compact_ints
         self.use_unsigned = use_unsigned

@@ -903,14 +897,7 @@ cdef class TextReader:
             # Don't care about memory usage
             columns = self._read_rows(rows, 1)

-        if self.as_recarray:
-            self._start_clock()
-            result = _to_structured_array(columns, self.header, self.usecols)
-            self._end_clock('Conversion to structured array')
-
-            return result
-        else:
-            return columns
+        return columns

     cdef _read_low_memory(self, rows):
         cdef:
@@ -999,7 +986,7 @@ cdef class TextReader:
         self._start_clock()
         columns = self._convert_column_data(rows=rows,
                                             footer=footer,
-                                            upcast_na=not self.as_recarray)
+                                            upcast_na=True)
         self._end_clock('Type conversion')

         self._start_clock()
@@ -2321,77 +2308,6 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
     return lib.maybe_convert_objects(result)


-def _to_structured_array(dict columns, object names, object usecols):
-    cdef:
-        ndarray recs, column
-        cnp.dtype dt
-        dict fields
-
-        object name, fnames, field_type
-        Py_ssize_t i, offset, nfields, length
-        int64_t stride, elsize
-        char *buf
-
-    if names is None:
-        names = ['%d' % i for i in range(len(columns))]
-    else:
-        # single line header
-        names = names[0]
-
-    if usecols is not None:
-        names = [n for i, n in enumerate(names)
-                 if i in usecols or n in usecols]
-
-    dt = np.dtype([(str(name), columns[i].dtype)
-                   for i, name in enumerate(names)])
-    fnames = dt.names
-    fields = dt.fields
-
-    nfields = len(fields)
-
-    if PY3:
-        length = len(list(columns.values())[0])
-    else:
-        length = len(columns.values()[0])
-
-    stride = dt.itemsize
-
-    # We own the data.
-    buf = <char*> malloc(length * stride)
-
-    recs = sarr_from_data(dt, length, buf)
-    assert(recs.flags.owndata)
-
-    for i in range(nfields):
-        # XXX
-        field_type = fields[fnames[i]]
-
-        # (dtype, stride) tuple
-        offset = field_type[1]
-        elsize = field_type[0].itemsize
-        column = columns[i]
-
-        _fill_structured_column(buf + offset, <char*> column.data,
-                                elsize, stride, length,
-                                field_type[0] == np.object_)
-
-    return recs
-
-
-cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
-                             int64_t stride, int64_t length, bint incref):
-    cdef:
-        int64_t i
-
-    if incref:
-        transfer_object_column(dst, src, stride, length)
-    else:
-        for i in range(length):
-            memcpy(dst, src, elsize)
-            dst += stride
-            src += elsize
-
-
 def _maybe_encode(values):
     if values is None:
         return []
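
For reference, the buffer-level assembly that the removed _to_structured_array/sarr_from_data pair performed can be reproduced with NumPy's public API. A rough, illustrative sketch (the column dict and names are hypothetical stand-ins for the parser's output):

    import numpy as np

    # Columns keyed by position, as in the removed code path.
    columns = {0: np.array([1, 2, 3], dtype='i8'),
               1: np.array([1.5, 2.5, 3.5], dtype='f8')}
    names = ['one', 'two']

    dt = np.dtype([(name, columns[i].dtype) for i, name in enumerate(names)])
    recs = np.empty(len(columns[0]), dtype=dt)  # array owns its buffer
    for i, name in enumerate(names):
        recs[name] = columns[i]                 # field-wise copy

    print(recs[0])  # (1, 1.5)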

pandas/_libs/src/numpy_helper.h (-14)

@@ -75,19 +75,6 @@ PANDAS_INLINE PyObject* char_to_string(char* data) {
 #endif
 }

-PyObject* sarr_from_data(PyArray_Descr* descr, int length, void* data) {
-    PyArrayObject* result;
-    npy_intp dims[1] = {length};
-    Py_INCREF(descr);  // newfromdescr steals a reference to descr
-    result = (PyArrayObject*)PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims,
-                                                  NULL, data, 0, NULL);
-
-    // Returned array doesn't own data by default
-    result->flags |= NPY_OWNDATA;
-
-    return (PyObject*)result;
-}
-
 void transfer_object_column(char* dst, char* src, size_t stride,
                             size_t length) {
     size_t i;
@@ -105,7 +92,6 @@ void transfer_object_column(char* dst, char* src, size_t stride,
     }
 }

-
 void set_array_not_contiguous(PyArrayObject* ao) {
     ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS);
 }

pandas/io/parsers.py (+1, -43)

@@ -108,14 +108,6 @@
     example of a valid callable argument would be ``lambda x: x.upper() in
     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
     parsing time and lower memory usage.
-as_recarray : boolean, default False
-    .. deprecated:: 0.19.0
-        Please call `pd.read_csv(...).to_records()` instead.
-
-    Return a NumPy recarray instead of a DataFrame after parsing the data.
-    If set to True, this option takes precedence over the `squeeze` parameter.
-    In addition, as row indices are not available in such a format, the
-    `index_col` parameter will be ignored.
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
 prefix : str, default None
@@ -506,7 +498,6 @@ def _read(filepath_or_buffer, kwds):

 _c_parser_defaults = {
     'delim_whitespace': False,
-    'as_recarray': False,
     'na_filter': True,
     'compact_ints': False,
     'use_unsigned': False,
@@ -532,14 +523,12 @@ def _read(filepath_or_buffer, kwds):
 }

 _deprecated_defaults = {
-    'as_recarray': None,
     'buffer_lines': None,
     'compact_ints': None,
     'use_unsigned': None,
     'tupleize_cols': None
 }
 _deprecated_args = {
-    'as_recarray',
     'buffer_lines',
     'compact_ints',
     'use_unsigned',
@@ -614,7 +603,6 @@ def parser_f(filepath_or_buffer,
                  # Internal
                  doublequote=True,
                  delim_whitespace=False,
-                 as_recarray=None,
                  compact_ints=None,
                  use_unsigned=None,
                  low_memory=_c_parser_defaults['low_memory'],
@@ -685,7 +673,6 @@ def parser_f(filepath_or_buffer,
                     compact_ints=compact_ints,
                     use_unsigned=use_unsigned,
                     delim_whitespace=delim_whitespace,
-                    as_recarray=as_recarray,
                     warn_bad_lines=warn_bad_lines,
                     error_bad_lines=error_bad_lines,
                     low_memory=low_memory,
@@ -971,9 +958,7 @@ def _clean_options(self, options, engine):
                        "and will be removed in a future version."
                        .format(arg=arg))

-                if arg == 'as_recarray':
-                    msg += ' Please call pd.to_csv(...).to_records() instead.'
-                elif arg == 'tupleize_cols':
+                if arg == 'tupleize_cols':
                     msg += (' Column tuples will then '
                             'always be converted to MultiIndex.')

@@ -1059,9 +1044,6 @@ def read(self, nrows=None):

         ret = self._engine.read(nrows)

-        if self.options.get('as_recarray'):
-            return ret
-
         # May alter columns / col_dict
         index, columns, col_dict = self._create_index(ret)

@@ -1279,7 +1261,6 @@ def __init__(self, kwds):

         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
-        self.as_recarray = kwds.get('as_recarray', False)
         self.tupleize_cols = kwds.get('tupleize_cols', False)
         self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
         self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
@@ -1295,9 +1276,6 @@ def __init__(self, kwds):
         if isinstance(self.header, (list, tuple, np.ndarray)):
             if not all(map(is_integer, self.header)):
                 raise ValueError("header must be integer or list of integers")
-            if kwds.get('as_recarray'):
-                raise ValueError("cannot specify as_recarray when "
-                                 "specifying a multi-index header")
             if kwds.get('usecols'):
                 raise ValueError("cannot specify usecols when "
                                  "specifying a multi-index header")
@@ -1900,10 +1878,6 @@ def read(self, nrows=None):
             # Done with first read, next time raise StopIteration
             self._first_chunk = False

-        if self.as_recarray:
-            # what to do if there are leading columns?
-            return data
-
         names = self.names

         if self._reader.leading_cols:
@@ -2306,9 +2280,6 @@ def read(self, rows=None):
         columns, data = self._do_date_conversions(columns, data)

         data = self._convert_data(data)
-        if self.as_recarray:
-            return self._to_recarray(data, columns)
-
         index, columns = self._make_index(data, alldata, columns, indexnamerow)

         return index, columns, data
@@ -2376,19 +2347,6 @@ def _clean_mapping(mapping):
                                           clean_na_fvalues, self.verbose,
                                           clean_conv, clean_dtypes)

-    def _to_recarray(self, data, columns):
-        dtypes = []
-        o = compat.OrderedDict()
-
-        # use the columns to "order" the keys
-        # in the unordered 'data' dictionary
-        for col in columns:
-            dtypes.append((str(col), data[col].dtype))
-            o[col] = data[col]
-
-        tuples = lzip(*o.values())
-        return np.array(tuples, dtypes)
-
     def _infer_columns(self):
         names = self.names
         num_original_columns = 0
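
With the keyword stripped from parser_f's signature and from _c_parser_defaults, passing it no longer produces a FutureWarning; it fails like any other unknown keyword. A small sketch of the expected post-removal behaviour (illustrative input):

    from io import StringIO

    import pandas as pd

    try:
        pd.read_csv(StringIO("a,b\n1,2"), as_recarray=True)
    except TypeError as err:
        # read_csv no longer accepts the argument at all.
        print(err)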

pandas/tests/io/parser/c_parser_only.py (-20)

@@ -18,7 +18,6 @@
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
 from pandas import DataFrame
-from pandas import compat
 from pandas.compat import StringIO, range, lrange


@@ -161,25 +160,6 @@ def error(val):
         assert sum(precise_errors) <= sum(normal_errors)
         assert max(precise_errors) <= max(normal_errors)

-    def test_pass_dtype_as_recarray(self):
-        if compat.is_platform_windows() and self.low_memory:
-            pytest.skip(
-                "segfaults on win-64, only when all tests are run")
-
-        data = """\
-one,two
-1,2.5
-2,3.5
-3,4.5
-4,5.5"""
-
-        with tm.assert_produces_warning(
-                FutureWarning, check_stacklevel=False):
-            result = self.read_csv(StringIO(data), dtype={
-                'one': 'u1', 1: 'S1'}, as_recarray=True)
-            assert result['one'].dtype == 'u1'
-            assert result['two'].dtype == 'S1'
-
     def test_usecols_dtypes(self):
         data = """\
 1,2,3
