diff --git a/doc/source/io.rst b/doc/source/io.rst index 6840717854dea..90bb762f1a1ba 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -890,6 +890,22 @@ of tupleizing columns, specify ``tupleize_cols=True``. print(open('mi.csv').read()) pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1]) +Starting in 0.13.0, ``read_csv`` will be able to interpret a more common format +of multi-column indices. + +.. ipython:: python + :suppress: + + data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12" + fh = open('mi2.csv','w') + fh.write(data) + fh.close() + +.. ipython:: python + + print(open('mi2.csv').read()) + pd.read_csv('mi2.csv',header=[0,1],index_col=0) + Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will be *lost*. @@ -898,6 +914,7 @@ with ``df.to_csv(..., index=False``), then any ``names`` on the columns index wi import os os.remove('mi.csv') + os.remove('mi2.csv') .. _io.sniff: @@ -1069,7 +1086,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet Orient Options ++++++++++++++ -There are a number of different options for the format of the resulting JSON +There are a number of different options for the format of the resulting JSON file / string. Consider the following DataFrame and Series: .. ipython:: python @@ -1080,7 +1097,7 @@ file / string. Consider the following DataFrame and Series: sjo = Series(dict(x=15, y=16, z=17), name='D') sjo -**Column oriented** (the default for ``DataFrame``) serialises the data as +**Column oriented** (the default for ``DataFrame``) serialises the data as nested JSON objects with column labels acting as the primary index: .. 
ipython:: python @@ -1113,7 +1130,7 @@ values only, column and index labels are not included: dfjo.to_json(orient="values") # Not available for Series -**Split oriented** serialises to a JSON object containing separate entries for +**Split oriented** serialises to a JSON object containing separate entries for values, index and columns. Name is also included for ``Series``: .. ipython:: python @@ -1123,7 +1140,7 @@ values, index and columns. Name is also included for ``Series``: .. note:: - Any orient option that encodes to a JSON object will not preserve the ordering of + Any orient option that encodes to a JSON object will not preserve the ordering of index and column labels during round-trip serialisation. If you wish to preserve label ordering use the `split` option as it uses ordered containers. @@ -1351,7 +1368,7 @@ The Numpy Parameter If ``numpy=True`` is passed to ``read_json`` an attempt will be made to sniff an appropriate dtype during deserialisation and to subsequently decode directly -to numpy arrays, bypassing the need for intermediate Python objects. +to numpy arrays, bypassing the need for intermediate Python objects. This can provide speedups if you are deserialising a large amount of numeric data: @@ -1375,7 +1392,7 @@ data: The speedup is less noticable for smaller datasets: .. ipython:: python - + jsonfloats = dffloats.head(100).to_json() .. ipython:: python @@ -1399,7 +1416,7 @@ The speedup is less noticable for smaller datasets: - labels are ordered. Labels are only read from the first container, it is assumed that each subsequent row / column has been encoded in the same order. This should be satisfied if the - data was encoded using ``to_json`` but may not be the case if the JSON + data was encoded using ``to_json`` but may not be the case if the JSON is from another source. .. 
ipython:: python diff --git a/doc/source/release.rst b/doc/source/release.rst index f161ead7f7ecc..b3fa90ed6f624 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -634,6 +634,8 @@ Bug Fixes - Fixed seg fault in C parser caused by passing more names than columns in the file. (:issue:`5156`) - Fix ``Series.isin`` with date/time-like dtypes (:issue:`5021`) + - C and Python Parser can now handle the more common multi-index column format + which doesn't have a row for index names (:issue:`4702`) pandas 0.12.0 ------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e9e82824326a7..c10cb84de34fd 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -569,7 +569,6 @@ def _clean_options(self, options, engine): skiprows = set() if skiprows is None else set(skiprows) # put stuff back - result['index_col'] = index_col result['names'] = names result['converters'] = converters result['na_values'] = na_values @@ -641,7 +640,7 @@ def __init__(self, kwds): self.orig_names = None self.prefix = kwds.pop('prefix', None) - self.index_col = kwds.pop('index_col', None) + self.index_col = kwds.get('index_col', None) self.index_names = None self.col_names = None @@ -1455,6 +1454,7 @@ def _convert_data(self, data): def _infer_columns(self): names = self.names num_original_columns = 0 + clear_buffer = True if self.header is not None: header = self.header @@ -1473,6 +1473,7 @@ def _infer_columns(self): while self.pos <= hr: line = self._next_line() + unnamed_count = 0 this_columns = [] for i, c in enumerate(line): if c == '': @@ -1480,6 +1481,7 @@ def _infer_columns(self): this_columns.append('Unnamed: %d_level_%d' % (i, level)) else: this_columns.append('Unnamed: %d' % i) + unnamed_count += 1 else: this_columns.append(c) @@ -1490,12 +1492,25 @@ def _infer_columns(self): if cur_count > 0: this_columns[i] = '%s.%d' % (col, cur_count) counts[col] = cur_count + 1 + elif have_mi_columns: + + # if we have grabbed an extra line, but its not in our 
format + # so save in the buffer, and create an blank extra line for the rest of the + # parsing code + if hr == header[-1]: + lc = len(this_columns) + ic = len(self.index_col) if self.index_col is not None else 0 + if lc != unnamed_count and lc-ic > unnamed_count: + clear_buffer = False + this_columns = [ None ] * lc + self.buf = [ self.buf[-1] ] columns.append(this_columns) if len(columns) == 1: num_original_columns = len(this_columns) - self._clear_buffer() + if clear_buffer: + self._clear_buffer() if names is not None: if (self.usecols is not None and len(names) != len(self.usecols)) \ diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 99a6c630e6ac4..66730f255eb1d 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1215,29 +1215,113 @@ def test_header_multi_index(self): R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ - df = read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) + df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) # skipping lines in the header - df = read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) + df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) #### invalid options #### # no as_recarray - self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3], + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], index_col=[0,1], as_recarray=True, tupleize_cols=False) # names - self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3], + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], index_col=[0,1], names=['foo','bar'], tupleize_cols=False) # usecols - self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3], + self.assertRaises(ValueError, self.read_csv, StringIO(data), 
header=[0,1,2,3], index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False) # non-numeric index_col - self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3], + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], index_col=['foo','bar'], tupleize_cols=False) + def test_header_multiindex_common_format(self): + + df = DataFrame([[1,2,3,4,5,6],[7,8,9,10,11,12]], + index=['one','two'], + columns=MultiIndex.from_tuples([('a','q'),('a','r'),('a','s'), + ('b','t'),('c','u'),('c','v')])) + + # to_csv + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +,,,,,, +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=0) + tm.assert_frame_equal(df,result) + + # common + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=0) + tm.assert_frame_equal(df,result) + + # common, no index_col + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=None) + tm.assert_frame_equal(df.reset_index(drop=True),result) + + # malformed case 1 + expected = DataFrame(np.array([[ 2, 3, 4, 5, 6], + [ 8, 9, 10, 11, 12]]), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], + labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[u('a'), u('q')])) + + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=0) + tm.assert_frame_equal(expected,result) + + # malformed case 2 + expected = DataFrame(np.array([[ 2, 3, 4, 5, 6], + [ 8, 9, 10, 11, 12]]), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], + labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, u('q')])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = 
self.read_csv(StringIO(data),header=[0,1],index_col=0) + tm.assert_frame_equal(expected,result) + + # mi on columns and index (malformed) + expected = DataFrame(np.array([[ 3, 4, 5, 6], + [ 9, 10, 11, 12]]), + index=MultiIndex(levels=[[1, 7], [2, 8]], + labels=[[0, 1], [0, 1]]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('s'), u('t'), u('u'), u('v')]], + labels=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, u('q')])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=[0, 1]) + tm.assert_frame_equal(expected,result) + def test_pass_names_with_index(self): lines = self.data1.split('\n') no_header = '\n'.join(lines[1:]) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 06a1ddfdae025..36b4b91023a73 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -250,6 +250,7 @@ cdef class TextReader: object memory_map object as_recarray object header, orig_header, names, header_start, header_end + object index_col object low_memory object skiprows object compact_ints, use_unsigned @@ -266,6 +267,7 @@ cdef class TextReader: header=0, header_start=0, header_end=0, + index_col=None, names=None, memory_map=False, @@ -439,6 +441,8 @@ cdef class TextReader: # XXX self.noconvert = set() + self.index_col = index_col + #---------------------------------------- # header stuff @@ -574,7 +578,7 @@ cdef class TextReader: # header is now a list of lists, so field_count should use header[0] cdef: - size_t i, start, data_line, field_count, passed_count, hr + size_t i, start, data_line, field_count, passed_count, hr, unnamed_count char *word object name int status @@ -606,6 +610,7 @@ cdef class TextReader: # TODO: Py3 vs. 
Py2 counts = {} + unnamed_count = 0 for i in range(field_count): word = self.parser.words[start + i] @@ -623,6 +628,7 @@ cdef class TextReader: name = 'Unnamed: %d_level_%d' % (i,level) else: name = 'Unnamed: %d' % i + unnamed_count += 1 count = counts.get(name, 0) if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: @@ -631,6 +637,19 @@ cdef class TextReader: this_header.append(name) counts[name] = count + 1 + if self.has_mi_columns: + + # if we have grabbed an extra line, but it's not in our format + # so save in the buffer, and create a blank extra line for the rest of the + # parsing code + if hr == self.header[-1]: + lc = len(this_header) + ic = len(self.index_col) if self.index_col is not None else 0 + if lc != unnamed_count and lc-ic > unnamed_count: + hr -= 1 + self.parser_start -= 1 + this_header = [ None ] * lc + data_line = hr + 1 header.append(this_header) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index f6db680d30061..7ae537b0b94df 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5348,15 +5348,19 @@ def test_to_csv_moar(self): def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None, dupe_col=False): + kwargs = dict(parse_dates=False) if cnlvl: - header = lrange(cnlvl) + if rnlvl is not None: + kwargs['index_col'] = lrange(rnlvl) + kwargs['header'] = lrange(cnlvl) with ensure_clean(path) as path: df.to_csv(path,encoding='utf8',chunksize=chunksize,tupleize_cols=False) - recons = DataFrame.from_csv(path,header=lrange(cnlvl),tupleize_cols=False,parse_dates=False) + recons = DataFrame.from_csv(path,tupleize_cols=False,**kwargs) else: + kwargs['header'] = 0 with ensure_clean(path) as path: df.to_csv(path,encoding='utf8',chunksize=chunksize) - recons = DataFrame.from_csv(path,header=0,parse_dates=False) + recons = DataFrame.from_csv(path,**kwargs) def _to_uni(x): if not isinstance(x, compat.text_type): @@ -5366,7 +5370,7 @@ def _to_uni(x): # read_Csv disambiguates the columns by 
# labeling them dupe.1,dupe.2, etc'. monkey patch columns recons.columns = df.columns - if rnlvl: + if rnlvl and not cnlvl: delta_lvl = [recons.icol(i).values for i in range(rnlvl-1)] ix=MultiIndex.from_arrays([list(recons.index)]+delta_lvl) recons.index = ix @@ -5417,7 +5421,7 @@ def _to_uni(x): recons.columns = np.array(recons.columns,dtype=c_dtype ) df.columns = np.array(df.columns,dtype=c_dtype ) - assert_frame_equal(df, recons,check_names=False,check_less_precise=True) + assert_frame_equal(df,recons,check_names=False,check_less_precise=True) N = 100 chunksize=1000 @@ -5476,7 +5480,7 @@ def make_dtnat_arr(n,nnat=None): base = int((chunksize// ncols or 1) or 1) for nrows in [10,N-2,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2, base-1,base,base+1]: - print( nrows,ncols) + #print( nrows,ncols) _do_test(mkdf(nrows, ncols),path) for nrows in [10,N-2,N-1,N,N+1,N+2]: @@ -5498,7 +5502,7 @@ def make_dtnat_arr(n,nnat=None): base = int(chunksize//ncols) for nrows in [10,N-2,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2, base-1,base,base+1]: - print(nrows, ncols) + #print(nrows, ncols) _do_test(mkdf(nrows, ncols,r_idx_nlevels=2),path,rnlvl=2) _do_test(mkdf(nrows, ncols,c_idx_nlevels=2),path,cnlvl=2) _do_test(mkdf(nrows, ncols,r_idx_nlevels=2,c_idx_nlevels=2), @@ -5615,11 +5619,8 @@ def _make_frame(names=None): # dup column names? 
df = mkdf(5,3,r_idx_nlevels=3,c_idx_nlevels=4) df.to_csv(path,tupleize_cols=False) - result = read_csv(path,header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) - result.columns = ['R2','A','B','C'] - new_result = result.reset_index().set_index(['R0','R1','R2']) - new_result.columns = df.columns - assert_frame_equal(df,new_result) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1,2],tupleize_cols=False) + assert_frame_equal(df,result) # writing with no index df = _make_frame() @@ -9881,7 +9882,7 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, if not ('max' in name or 'min' in name or 'count' in name): df = DataFrame({'b': date_range('1/1/2001', periods=2)}) _f = getattr(df, name) - print(df) + #print(df) self.assertFalse(len(_f())) df['a'] = lrange(len(df)) @@ -11786,7 +11787,7 @@ def to_series(mi, level): if isinstance(v, Index): assert v.is_(expected[k]) elif isinstance(v, Series): - print(k) + #print(k) tm.assert_series_equal(v, expected[k]) else: raise AssertionError("object must be a Series or Index")