From 4038cfb12fd0d62b0f64d961c3416460e22e6eec Mon Sep 17 00:00:00 2001 From: "Richard T. Guy" Date: Mon, 30 Sep 2013 21:30:34 -0400 Subject: [PATCH] ENH: Add usecols option to python parser. Closes #4335 Added release note and fixed py3 compat Updated docs for consistency --- doc/source/io.rst | 8 +- doc/source/v0.13.0.txt | 16 +-- pandas/io/parsers.py | 209 +++++++++++++++++++++----------- pandas/io/tests/test_parsers.py | 128 +++++++++---------- 4 files changed, 215 insertions(+), 146 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index e75de91582b49..37227edc83fe2 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -85,11 +85,11 @@ They can take a number of arguments: ways to specify the file format - ``dtype``: A data type name or a dict of column name to data type. If not specified, data types will be inferred. - - ``header``: row number to use as the column names, and the start of the + - ``header``: row number(s) to use as the column names, and the start of the data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly pass ``header=0`` to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns - E.g. [0,1,3]. Interveaning rows that are not specified will be skipped. + E.g. [0,1,3]. Intervening rows that are not specified will be skipped. (E.g. 2 in this example are skipped) - ``skiprows``: A collection of numbers for rows in the file to skip. Can also be an integer to skip the first ``n`` rows @@ -2938,7 +2938,7 @@ into BigQuery and pull it into a DataFrame. .. code-block:: python from pandas.io import gbq - + # Insert your BigQuery Project ID Here # Can be found in the web console, or # using the command line tool `bq ls` @@ -2998,7 +2998,7 @@ To add more rows to this, simply: To use this module, you will need a BigQuery account. See for details. - + As of 10/10/13, there is a bug in Google's API preventing result sets from being larger than 100,000 rows. A patch is scheduled for the week of 10/14/13. diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index b2c78f38140b4..603cffcc1b76b 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -505,11 +505,13 @@ Enhancements - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table from semi-structured JSON data. See :ref:`the docs` (:issue:`1067`) - - Added PySide support for the qtpandas DataFrameModel and DataFrameWidget. +- Python csv parser now supports usecols (:issue:`4335`) + - DataFrame has a new ``interpolate`` method, similar to Series (:issue:`4434`, :issue:`1892`) + .. ipython:: python df = DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], @@ -654,7 +656,7 @@ Experimental against extremely large datasets. :ref:`See the docs ` .. 
code-block:: python
-        
+
        from pandas.io import gbq

        # A query to select the average monthly temperatures in the
@@ -665,8 +667,8 @@ Experimental
        query = """SELECT station_number as STATION,
                          month as MONTH, AVG(mean_temp) as MEAN_TEMP
                   FROM publicdata:samples.gsod
-                  WHERE YEAR = 2000 
-                  GROUP BY STATION, MONTH 
+                  WHERE YEAR = 2000
+                  GROUP BY STATION, MONTH
                   ORDER BY STATION, MONTH ASC"""

        # Fetch the result set for this query
@@ -675,7 +677,7 @@ Experimental
        # To find this, see your dashboard:
        # https://code.google.com/apis/console/b/0/?noredirect
        projectid = xxxxxxxxx;
-        
+
        df = gbq.read_gbq(query, project_id = projectid)

        # Use pandas to process and reshape the dataset
@@ -686,9 +688,9 @@ Experimental

    The resulting dataframe is::

-       > df3 
+       > df3
                   Min Tem  Mean Temp    Max Temp
-       MONTH                                     
+       MONTH
        1        -53.336667  39.827892   89.770968
        2        -49.837500  43.685219   93.437932
        3        -77.926087  48.708355   96.099998
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 76d6a3909f89f..e9e82824326a7 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -52,11 +52,12 @@
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details
-header : int, default 0 if names parameter not specified,
-    Row to use for the column labels of the parsed DataFrame. Specify None if
-    there is no header row. Can be a list of integers that specify row
-    locations for a multi-index on the columns E.g. [0,1,3]. Interveaning
-    rows that are not specified (E.g. 2 in this example are skipped)
+header : int or list of ints, default 0 if no ``names`` passed, otherwise None
+    Row number(s) to use as the column names, and the start of the data.
+    Explicitly pass ``header=0`` to be able to replace existing names. The
+    header can be a list of integers that specify row locations for a
+    multi-index on the columns, e.g. [0,1,3]. Intervening rows that are not
+    specified will be skipped (e.g. 2 in this example is skipped).
 skiprows : list-like or integer
     Row numbers to skip (0-indexed) or number of rows to skip (int)
     at the start of the file
@@ -917,22 +918,6 @@ def _do_date_conversions(self, names, data):
 
         return names, data
 
-    def _exclude_implicit_index(self, alldata):
-
-        if self._implicit_index:
-            excl_indices = self.index_col
-
-            data = {}
-            offset = 0
-            for i, col in enumerate(self.orig_names):
-                while i + offset in excl_indices:
-                    offset += 1
-                data[col] = alldata[i + offset]
-        else:
-            data = dict((k, v) for k, v in zip(self.orig_names, alldata))
-
-        return data
-
 
 class CParserWrapper(ParserBase):
     """
@@ -1173,22 +1158,6 @@ def TextParser(*args, **kwds):
     return TextFileReader(*args, **kwds)
 
 
-# delimiter=None, dialect=None, names=None, header=0,
-# index_col=None,
-# na_values=None,
-# na_filter=True,
-# thousands=None,
-# quotechar='"',
-# escapechar=None,
-# doublequote=True,
-# skipinitialspace=False,
-# quoting=csv.QUOTE_MINIMAL,
-# comment=None, parse_dates=False, keep_date_col=False,
-# date_parser=None, dayfirst=False,
-# chunksize=None, skiprows=None, skip_footer=0, converters=None,
-# verbose=False, encoding=None, squeeze=False):
-
-
 def count_empty_vals(vals):
     return sum([1 for v in vals if v == '' or v is None])
 
@@ -1242,10 +1211,6 @@ def __init__(self, f, **kwds):
         self.buf = []
         self.pos = 0
 
-        if kwds['usecols'] is not None:
-            raise Exception("usecols not supported with engine='python'"
-                            " or multicharacter separators (yet).")
-
         self.encoding = kwds['encoding']
         self.compression = kwds['compression']
         self.skiprows = kwds['skiprows']
@@ -1259,7 +1224,10 @@ def __init__(self, f, **kwds):
         self.skipinitialspace = kwds['skipinitialspace']
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
-        self.mangle_dupe_cols = kwds.get('mangle_dupe_cols',True)
+        self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
+        self.usecols = kwds['usecols']
+
+        self.names_passed = kwds['names'] or None
 
         self.has_index_names = False
         if 'has_index_names' in kwds:
@@ -1283,17 +1251,25 @@ def __init__(self, f, **kwds):
 
             f = TextIOWrapper(f, encoding=self.encoding)
 
+        # Set self.data to something that can read lines.
        if hasattr(f, 'readline'):
            self._make_reader(f)
        else:
            self.data = f
-        self.columns = self._infer_columns()
 
+        # Get columns in two steps: infer them from the data, then
+        # infer the column indices from self.usecols if it is specified.
+        self._col_indices = None
+        self.columns, self.num_original_columns = self._infer_columns()
 
-        # we are processing a multi index column
+        # Now self.columns holds the set of columns we will process;
+        # the original column count is kept in self.num_original_columns.
        if len(self.columns) > 1:
+            # we are processing a multi index column
            self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns(
                self.columns, self.index_names, self.col_names)
+            # Update list of original names to include all indices.
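+            # num_original_columns is consulted later by _get_index_name and
+            # _rows_to_cols to validate row lengths after usecols has
+            # trimmed self.columns.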
+            self.num_original_columns = len(self.columns)
        else:
            self.columns = self.columns[0]
 
@@ -1304,7 +1280,7 @@ def __init__(self, f, **kwds):
         # multiple date column thing turning into a real spaghetti factory
         if not self._has_complex_date_col:
             (index_names,
-             self.orig_names, _) = self._get_index_name(self.columns)
+             self.orig_names, columns_) = self._get_index_name(self.columns)
             self._name_processed = True
             if self.index_names is None:
                 self.index_names = index_names
@@ -1442,6 +1418,22 @@ def read(self, rows=None):
 
         return index, columns, data
 
+    def _exclude_implicit_index(self, alldata):
+
+        if self._implicit_index:
+            excl_indices = self.index_col
+
+            data = {}
+            offset = 0
+            for i, col in enumerate(self.orig_names):
+                while i + offset in excl_indices:
+                    offset += 1
+                data[col] = alldata[i + offset]
+        else:
+            data = dict((k, v) for k, v in zip(self.orig_names, alldata))
+
+        return data
+
     # legacy
     def get_chunk(self, size=None):
         if size is None:
@@ -1462,7 +1454,7 @@ def _convert_data(self, data):
 
     def _infer_columns(self):
         names = self.names
-
+        num_original_columns = 0
         if self.header is not None:
             header = self.header
 
@@ -1476,10 +1468,7 @@ def _infer_columns(self):
 
             columns = []
             for level, hr in enumerate(header):
-                if len(self.buf) > 0:
-                    line = self.buf[0]
-                else:
-                    line = self._next_line()
+                line = self._buffered_line()
 
                 while self.pos <= hr:
                     line = self._next_line()
@@ -1488,51 +1477,103 @@ def _infer_columns(self):
                 for i, c in enumerate(line):
                     if c == '':
                         if have_mi_columns:
-                            this_columns.append('Unnamed: %d_level_%d' % (i,level))
+                            this_columns.append('Unnamed: %d_level_%d' % (i, level))
                         else:
                             this_columns.append('Unnamed: %d' % i)
                     else:
                         this_columns.append(c)
 
-                if not have_mi_columns:
-                    if self.mangle_dupe_cols:
-                        counts = {}
-                        for i, col in enumerate(this_columns):
-                            cur_count = counts.get(col, 0)
-                            if cur_count > 0:
-                                this_columns[i] = '%s.%d' % (col, cur_count)
-                            counts[col] = cur_count + 1
+                if not have_mi_columns and self.mangle_dupe_cols:
+                    counts = {}
+                    for i, col in enumerate(this_columns):
+                        cur_count = counts.get(col, 0)
+                        if cur_count > 0:
+                            this_columns[i] = '%s.%d' % (col, cur_count)
+                        counts[col] = cur_count + 1
 
                 columns.append(this_columns)
+                if len(columns) == 1:
+                    num_original_columns = len(this_columns)
 
             self._clear_buffer()
 
             if names is not None:
-                if len(names) != len(columns[0]):
+                if (self.usecols is not None and len(names) != len(self.usecols)) \
+                        or (self.usecols is None and len(names) != len(columns[0])):
+
                     raise ValueError('Number of passed names did not match '
-                                     'number of header fields in the file')
+                                     'number of header fields in the file')
                 if len(columns) > 1:
                     raise TypeError('Cannot pass names with multi-index '
                                     'columns')
-                columns = [ names ]
-            else:
-                if len(self.buf) > 0:
-                    line = self.buf[0]
+                if self.usecols is not None:
+                    # Set self._col_indices. We don't store the trimmed
+                    # columns because the passed names overwrite them below.
+                    self._handle_usecols(columns, names)
+                else:
+                    self._col_indices = None
+                    num_original_columns = len(names)
+                columns = [names]
             else:
-                line = self._next_line()
-
+                columns = self._handle_usecols(columns, columns[0])
+        else:
+            # header is None
+            line = self._buffered_line()
             ncols = len(line)
+            num_original_columns = ncols
             if not names:
                 if self.prefix:
                     columns = [ ['X%d' % i for i in range(ncols)] ]
                 else:
                     columns = [ lrange(ncols) ]
+                columns = self._handle_usecols(columns, columns[0])
             else:
-                columns = [ names ]
+                if self.usecols is None or len(names) == num_original_columns:
+                    columns = self._handle_usecols([names], names)
+                    num_original_columns = len(names)
+                else:
+                    if self.usecols and len(names) != len(self.usecols):
+                        raise ValueError('Number of passed names did not match '
+                                         'number of header fields in the file')
+                    # Ignore the trimmed columns, but record the indices
+                    # of the used columns.
+                    self._handle_usecols([names], names)
+                    columns = [names]
+                    num_original_columns = ncols
+        return columns, num_original_columns
+
+    def _handle_usecols(self, columns, usecols_key):
+        """
+        Sets self._col_indices and returns columns trimmed to the
+        fields named in self.usecols.
+
+        usecols_key is used to resolve string values in usecols.
+        """
+        if self.usecols is not None:
+            if any([isinstance(u, string_types) for u in self.usecols]):
+                if len(columns) > 1:
+                    raise ValueError("If using multiple headers, usecols "
+                                     "must be integers.")
+                col_indices = []
+                for u in self.usecols:
+                    if isinstance(u, string_types):
+                        col_indices.append(usecols_key.index(u))
+                    else:
+                        col_indices.append(u)
+            else:
+                col_indices = self.usecols
+
+            columns = [[n for i, n in enumerate(column) if i in col_indices]
+                       for column in columns]
+            self._col_indices = col_indices
         return columns
 
+    def _buffered_line(self):
+        """
+        Return a line from buffer, filling buffer if required.
+        """
+        if len(self.buf) > 0:
+            return self.buf[0]
+        else:
+            return self._next_line()
+
     def _next_line(self):
         if isinstance(self.data, list):
             while self.pos in self.skiprows:
@@ -1598,6 +1639,17 @@ def _clear_buffer(self):
     _implicit_index = False
 
     def _get_index_name(self, columns):
+        """
+        Try several cases to get the index name(s) and columns:
+
+        0) There are headers on rows 0 and 1, and the sum of their
+           lengths equals the length of the next (data) line.
+           Treat row 0 as columns and row 1 as index names.
+        1) Look for an implicit index: there are more columns
+           on row 1 than on row 0. If so, assume that row 1 lists
+           the index columns and row 0 lists the normal columns.
+        2) Get the index from the columns if it was listed.
+        """
         orig_names = list(columns)
         columns = list(columns)
 
@@ -1615,29 +1667,34 @@ def _get_index_name(self, columns):
             implicit_first_cols = 0
 
         if line is not None:  # leave it 0, #2442
+            # Case 1
             if self.index_col is not False:
-                implicit_first_cols = len(line) - len(columns)
+                implicit_first_cols = len(line) - self.num_original_columns
 
+            # Case 0
             if next_line is not None:
-                if len(next_line) == len(line) + len(columns):
+                if len(next_line) == len(line) + self.num_original_columns:
                     # column and index names on diff rows
-                    implicit_first_cols = 0
-
                     self.index_col = lrange(len(line))
                     self.buf = self.buf[1:]
 
                     for c in reversed(line):
                         columns.insert(0, c)
 
+                    # Update list of original names to include all indices.
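+                    # (The first data row spans the index fields plus the
+                    # named header fields, so its length gives the original
+                    # column count.)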
+                    self.num_original_columns = len(next_line)
                     return line, columns, orig_names
 
         if implicit_first_cols > 0:
+            # Case 1
             self._implicit_index = True
             if self.index_col is None:
                 self.index_col = lrange(implicit_first_cols)
+
             index_name = None
 
         else:
+            # Case 2
             (index_name, columns,
              self.index_col) = _clean_index_names(columns, self.index_col)
 
@@ -1646,7 +1703,7 @@ def _get_index_name(self, columns):
     def _rows_to_cols(self, content):
         zipped_content = list(lib.to_object_array(content).T)
 
-        col_len = len(self.orig_names)
+        col_len = self.num_original_columns
         zip_len = len(zipped_content)
 
         if self._implicit_index:
@@ -1655,6 +1712,7 @@ def _rows_to_cols(self, content):
         if self.skip_footer < 0:
             raise ValueError('skip footer cannot be negative')
 
+        # Loop through rows to verify lengths are correct.
         if col_len != zip_len and self.index_col is not False:
             i = 0
             for (i, l) in enumerate(content):
@@ -1671,6 +1729,11 @@ def _rows_to_cols(self, content):
                        (col_len, row_num + 1, zip_len))
                 raise ValueError(msg)
 
+        if self.usecols:
+            if self._implicit_index:
+                zipped_content = [
+                    a for i, a in enumerate(zipped_content)
+                    if (i < len(self.index_col)
+                        or i - len(self.index_col) in self._col_indices)]
+            else:
+                zipped_content = [a for i, a in enumerate(zipped_content)
+                                  if i in self._col_indices]
         return zipped_content
 
     def _get_lines(self, rows=None):
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index cf0c01c8dff50..b81feec6ab6f8 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -18,7 +18,7 @@
 from pandas.compat import(
     StringIO, BytesIO, PY3, range, long, lrange, lmap, u
 )
-from pandas.io.common import urlopen, URLError
+from pandas.io.common import URLError
 import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                TextFileReader, TextParser)
@@ -761,8 +761,6 @@ def test_deep_skiprows(self):
         condensed_data = self.read_csv(StringIO(condensed_text))
         tm.assert_frame_equal(data, condensed_data)
 
-
-
     def test_detect_string_na(self):
         data = """A,B
 foo,bar
@@ -1217,14 +1215,11 @@ def test_header_multi_index(self):
 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
 """
 
-        # basic test with both engines
-        for engine in ['c','python']:
-            df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], tupleize_cols=False,
-                          engine=engine)
-            tm.assert_frame_equal(df, expected)
+        df = read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
+        tm.assert_frame_equal(df, expected)
 
         # skipping lines in the header
-        df = read_csv(StringIO(data), header=[0,2,3,4],index_col=[0,1], tupleize_cols=False)
+        df = read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False)
         tm.assert_frame_equal(df, expected)
 
         #### invalid options ####
@@ -1825,9 +1820,6 @@ def test_integer_overflow_bug(self):
         result = self.read_csv(StringIO(data), header=None, sep=' ')
         self.assertTrue(result[0].dtype == np.float64)
 
-        result = self.read_csv(StringIO(data), header=None, sep='\s+')
-        self.assertTrue(result[0].dtype == np.float64)
-
     def test_int64_min_issues(self):
         # #2599
         data = 'A,B\n0,0\n0,'
@@ -1908,6 +1900,61 @@ def test_warn_if_chunks_have_mismatched_type(self):
         df = self.read_csv(StringIO(data))
         self.assertEqual(df.a.dtype, np.object)
 
+    def test_usecols(self):
+        data = """\
+a,b,c
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+
+        result = self.read_csv(StringIO(data), usecols=(1, 2))
+        result2 = self.read_csv(StringIO(data), usecols=('b', 'c'))
+        exp = self.read_csv(StringIO(data))
+
+        self.assertEquals(len(result.columns), 2)
+
self.assertTrue((result['b'] == exp['b']).all()) + self.assertTrue((result['c'] == exp['c']).all()) + + tm.assert_frame_equal(result, result2) + + result = self.read_csv(StringIO(data), usecols=[1, 2], header=0, + names=['foo', 'bar']) + expected = self.read_csv(StringIO(data), usecols=[1, 2]) + expected.columns = ['foo', 'bar'] + tm.assert_frame_equal(result, expected) + + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + result = self.read_csv(StringIO(data), names=['b', 'c'], + header=None, usecols=[1, 2]) + + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + expected = expected[['b', 'c']] + tm.assert_frame_equal(result, expected) + + result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None, usecols=['b', 'c']) + tm.assert_frame_equal(result2, result) + + # length conflict, passed names and usecols disagree + self.assertRaises(ValueError, self.read_csv, StringIO(data), + names=['a', 'b'], usecols=[1], header=None) + + def test_integer_overflow_bug(self): + # #2601 + data = "65248E10 11\n55555E55 22\n" + + result = self.read_csv(StringIO(data), header=None, sep=' ') + self.assertTrue(result[0].dtype == np.float64) + + result = self.read_csv(StringIO(data), header=None, sep='\s+') + self.assertTrue(result[0].dtype == np.float64) + class TestPythonParser(ParserTests, unittest.TestCase): def test_negative_skipfooter_raises(self): @@ -2360,6 +2407,9 @@ def test_parse_dates_empty_string(self): result = pd.read_csv(s, parse_dates=["Date"], na_filter=False) self.assertTrue(result['Date'].isnull()[1]) + def test_usecols(self): + raise nose.SkipTest("Usecols is not supported in C High Memory engine.") + class TestCParserLowMemory(ParserTests, unittest.TestCase): @@ -2406,51 +2456,6 @@ def test_pass_dtype(self): self.assert_(result['one'].dtype == 'u1') self.assert_(result['two'].dtype == 'S1') - def test_usecols(self): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - - result = self.read_csv(StringIO(data), usecols=(1, 2)) - result2 = self.read_csv(StringIO(data), usecols=('b', 'c')) - exp = self.read_csv(StringIO(data)) - - self.assertEquals(len(result.columns), 2) - self.assertTrue((result['b'] == exp['b']).all()) - self.assertTrue((result['c'] == exp['c']).all()) - - tm.assert_frame_equal(result, result2) - - result = self.read_csv(StringIO(data), usecols=[1, 2], header=0, - names=['foo', 'bar']) - expected = self.read_csv(StringIO(data), usecols=[1, 2]) - expected.columns = ['foo', 'bar'] - tm.assert_frame_equal(result, expected) - - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - result = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None, usecols=[1, 2]) - - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - expected = expected[['b', 'c']] - tm.assert_frame_equal(result, expected) - - result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None, usecols=['b', 'c']) - tm.assert_frame_equal(result2, result) - - # length conflict, passed names and usecols disagree - self.assertRaises(ValueError, self.read_csv, StringIO(data), - names=['a', 'b'], usecols=[1], header=None) - def test_usecols_dtypes(self): data = """\ 1,2,3 @@ -2496,12 +2501,11 @@ def test_usecols_regex_sep(self): # #2733 data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' - self.assertRaises(Exception, self.read_csv, StringIO(data), - sep='\s+', usecols=('a', 'b')) + df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b')) - # expected = DataFrame({'a': ['apple', 'orange'], - # 'b': ['bat', 'cow']}, 
index=[4, 8]) - # tm.assert_frame_equal(result, expected) + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + tm.assert_frame_equal(df, expected) def test_pure_python_failover(self): data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
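
A minimal sketch of the behavior this patch enables, mirroring the new
``test_usecols`` above (the inline ``data`` string is illustrative only;
``engine='python'`` is passed explicitly to exercise the pure-Python parser):

    import pandas.util.testing as tm
    from pandas.compat import StringIO
    from pandas.io.parsers import read_csv

    data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'

    # Before this change, both calls raised
    # "usecols not supported with engine='python' ... (yet)."
    by_position = read_csv(StringIO(data), usecols=[1, 2], engine='python')
    by_name = read_csv(StringIO(data), usecols=['b', 'c'], engine='python')

    # Positional and name-based selection pick the same two columns.
    tm.assert_frame_equal(by_position, by_name)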