diff --git a/doc/source/io.rst b/doc/source/io.rst index 6aa2df3549914..6802a448c4e14 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -134,6 +134,14 @@ usecols : array-like, default ``None`` inferred from the document header row(s). For example, a valid `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter results in much faster parsing time and lower memory usage. +as_recarray : boolean, default ``False`` + DEPRECATED: this argument will be removed in a future version. Please call + ``pd.read_csv(...).to_records()`` instead. + + Return a NumPy recarray instead of a DataFrame after parsing the data. If + set to ``True``, this option takes precedence over the ``squeeze`` parameter. + In addition, as row indices are not available in such a format, the ``index_col`` + parameter will be ignored. squeeze : boolean, default ``False`` If the parsed data only contains one column then return a Series. prefix : str, default ``None`` @@ -179,9 +187,6 @@ low_memory : boolean, default ``True`` buffer_lines : int, default None DEPRECATED: this argument will be removed in a future version because its value is not respected by the parser - - If ``low_memory`` is ``True``, specify the number of rows to be read for - each chunk. 
(Only valid with C parser) compact_ints : boolean, default False DEPRECATED: this argument will be removed in a future version diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 93aedce07da9d..1e95af2df247b 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -295,6 +295,7 @@ Deprecations - ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) - ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) +- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) .. _whatsnew_0182.performance: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 04b488aff5c0c..0f0e1848750c0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,7 +2,8 @@ Module contains tools for processing files into DataFrames or other objects """ from __future__ import print_function -from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map +from pandas.compat import (range, lrange, StringIO, lzip, zip, + string_types, map, OrderedDict) from pandas import compat from collections import defaultdict import re @@ -87,6 +88,14 @@ inferred from the document header row(s). For example, a valid `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter results in much faster parsing time and lower memory usage. +as_recarray : boolean, default False + DEPRECATED: this argument will be removed in a future version. Please call + `pd.read_csv(...).to_records()` instead. + + Return a NumPy recarray instead of a DataFrame after parsing the data. + If set to True, this option takes precedence over the `squeeze` parameter. + In addition, as row indices are not available in such a format, the + `index_col` parameter will be ignored. 
squeeze : boolean, default False If the parsed data only contains one column then return a Series prefix : str, default None @@ -239,9 +248,6 @@ buffer_lines : int, default None DEPRECATED: this argument will be removed in a future version because its value is not respected by the parser - - If low_memory is True, specify the number of rows to be read for each - chunk. (Only valid with C parser) compact_ints : boolean, default False DEPRECATED: this argument will be removed in a future version @@ -452,7 +458,6 @@ def _read(filepath_or_buffer, kwds): _c_unsupported = set(['skip_footer']) _python_unsupported = set([ - 'as_recarray', 'low_memory', 'memory_map', 'buffer_lines', @@ -462,6 +467,7 @@ def _read(filepath_or_buffer, kwds): 'float_precision', ]) _deprecated_args = set([ + 'as_recarray', 'buffer_lines', 'compact_ints', 'use_unsigned', @@ -820,12 +826,22 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) + depr_warning = '' + for arg in _deprecated_args: parser_default = _c_parser_defaults[arg] + msg = ("The '{arg}' argument has been deprecated " + "and will be removed in a future version." + .format(arg=arg)) + + if arg == 'as_recarray': + msg += ' Please call pd.read_csv(...).to_records() instead.' 
+ if result.get(arg, parser_default) != parser_default: - warnings.warn("The '{arg}' argument has been deprecated " - "and will be removed in a future version" - .format(arg=arg), FutureWarning, stacklevel=2) + depr_warning += msg + '\n\n' + + if depr_warning != '': + warnings.warn(depr_warning, FutureWarning, stacklevel=2) if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") @@ -973,6 +989,7 @@ def __init__(self, kwds): self.na_fvalues = kwds.get('na_fvalues') self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') + self.as_recarray = kwds.get('as_recarray', False) self.tupleize_cols = kwds.get('tupleize_cols', False) self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) @@ -1304,7 +1321,6 @@ def __init__(self, src, **kwds): self.kwds = kwds kwds = kwds.copy() - self.as_recarray = kwds.get('as_recarray', False) ParserBase.__init__(self, kwds) if 'utf-16' in (kwds.get('encoding') or ''): @@ -1889,6 +1905,9 @@ def read(self, rows=None): columns, data = self._do_date_conversions(columns, data) data = self._convert_data(data) + if self.as_recarray: + return self._to_recarray(data, columns) + index, columns = self._make_index(data, alldata, columns, indexnamerow) return index, columns, data @@ -1928,6 +1947,19 @@ def _convert_data(self, data): return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose, clean_conv) + def _to_recarray(self, data, columns): + dtypes = [] + o = OrderedDict() + + # use the columns to "order" the keys + # in the unordered 'data' dictionary + for col in columns: + dtypes.append((str(col), data[col].dtype)) + o[col] = data[col] + + tuples = lzip(*o.values()) + return np.array(tuples, dtypes) + def _infer_columns(self): names = self.names num_original_columns = 0 diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 
b7ef754004e18..90103064774c1 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -172,30 +172,6 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_compact_ints_as_recarray(self): - if compat.is_platform_windows(): - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False): - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False): - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - def test_pass_dtype(self): data = """\ one,two @@ -220,10 +196,12 @@ def test_pass_dtype_as_recarray(self): 3,4.5 4,5.5""" - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}, - as_recarray=True) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'S1') + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 1: 'S1'}, as_recarray=True) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'S1') def test_empty_pass_dtype(self): data = 'one,two' diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index f8c7241fdf88a..fdaac71f59386 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -608,10 +608,6 @@ def test_url(self): @tm.slow def test_file(self): - - # FILE - if 
sys.version_info[:2] < (2, 6): - raise nose.SkipTest("file:// not supported with Python < 2.6") dirpath = tm.get_data_path() localtable = os.path.join(dirpath, 'salary.table.csv') local_table = self.read_table(localtable) @@ -925,8 +921,8 @@ def test_empty_with_nrows_chunksize(self): StringIO('foo,bar\n'), chunksize=10))) tm.assert_frame_equal(result, expected) - # 'as_recarray' is not supported yet for the Python parser - if self.engine == 'c': + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): result = self.read_csv(StringIO('foo,bar\n'), nrows=10, as_recarray=True) result = DataFrame(result[2], columns=result[1], @@ -934,11 +930,13 @@ def test_empty_with_nrows_chunksize(self): tm.assert_frame_equal(DataFrame.from_records( result), expected, check_index_type=False) - result = next(iter(self.read_csv( - StringIO('foo,bar\n'), chunksize=10, as_recarray=True))) + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = next(iter(self.read_csv(StringIO('foo,bar\n'), + chunksize=10, as_recarray=True))) result = DataFrame(result[2], columns=result[1], index=result[0]) - tm.assert_frame_equal(DataFrame.from_records( - result), expected, check_index_type=False) + tm.assert_frame_equal(DataFrame.from_records(result), expected, + check_index_type=False) def test_eof_states(self): # see gh-10728, gh-10548 @@ -1373,3 +1371,90 @@ def test_compact_ints_use_unsigned(self): out = self.read_csv(StringIO(data), compact_ints=True, use_unsigned=True) tm.assert_frame_equal(out, expected) + + def test_compact_ints_as_recarray(self): + data = ('0,1,0,0\n' + '1,1,0,0\n' + '0,1,0,1') + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + with tm.assert_produces_warning( + FutureWarning, 
check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + def test_as_recarray(self): + # basic test + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + data = 'a,b\n1,a\n2,b' + expected = np.array([(1, 'a'), (2, 'b')], + dtype=[('a', '