diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4d41c02ad4200..0edd6a0bc7b9d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1021,15 +1021,17 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) - def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, - index=True, index_label=None): + def _helper_csvexcel(self, writer, na_rep=None, cols=None, + header=True, index=True, index_label=None): if cols is None: cols = self.columns series = {} for k, v in self._series.iteritems(): series[k] = v.values - if header: + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or header: if index: # should write something for index label if index_label is None: @@ -1050,7 +1052,15 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, index_label = [index_label] encoded_labels = list(index_label) - encoded_cols = list(cols) + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + encoded_cols = list(write_cols) writer.writerow(encoded_labels + encoded_cols) else: @@ -1075,8 +1085,8 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, writer.writerow(row_fields) def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, - header=True, index=True, index_label=None, mode='w', - nanRep=None, encoding=None): + header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None): """ Write DataFrame to a comma-separated values (csv) file @@ -1088,8 +1098,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, Missing data representation cols : sequence, optional Columns to write - header : boolean, default True - Write out column names + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names index : boolean, default True Write row names (index) index_label : string or sequence, default None @@ -1125,6 +1136,7 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols, header=header, index=index, index_label=index_label) + finally: if close: f.close() @@ -1144,8 +1156,9 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', Missing data rep'n cols : sequence, optional Columns to write - header : boolean, default True - Write out column names + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names index : boolean, default True Write row names (index) index_label : string or sequence, default None diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8261b889b4500..3b2fedd65b45f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1029,9 +1029,9 @@ def _convert_types(values, na_values): return values, na_count try: - result = lib.maybe_convert_numeric(values, na_values) + result = lib.maybe_convert_numeric(values, na_values, False) except Exception: - na_count = lib.sanitize_objects(values, na_values) + na_count = lib.sanitize_objects(values, na_values, False) result = values if result.dtype == np.object_: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7a75e687a4a8a..1f14df528af8f 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -52,6 +52,32 @@ def setUp(self): self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') + def test_empty_string(self): + data = """\ +One,Two,Three +a,1,one +b,2,two +,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = read_csv(StringIO(data)) + xp = DataFrame({'One' : ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = read_csv(StringIO(data), na_values={'One': [], 'Three': []}) + xp = DataFrame({'One' : ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', 'nan', 'five', + '', 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + def test_read_csv(self): pass diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 4057d240fceb6..451b998b0481b 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -241,7 +241,8 @@ def is_date_array(ndarray[object] values): return True -def maybe_convert_numeric(ndarray[object] values, set na_values): +def maybe_convert_numeric(ndarray[object] values, set na_values, + convert_empty=True): ''' Type inference function-- convert strings to numeric (potentially) and convert to proper dtype array @@ -275,8 +276,11 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): floats[i] = complexes[i] = nan seen_float = 1 elif len(val) == 0: - floats[i] = complexes[i] = nan - seen_float = 1 + if convert_empty: + floats[i] = complexes[i] = nan + seen_float = 1 + else: + raise ValueError('Empty string encountered') elif util.is_complex_object(val): complexes[i] = val seen_complex = 1 @@ -573,7 +577,8 @@ def try_parse_datetime_components(ndarray[object] years, ndarray[object] months, return result -def sanitize_objects(ndarray[object] values, set na_values): +def sanitize_objects(ndarray[object] values, set na_values, + convert_empty=True): cdef: Py_ssize_t i, n object val, onan @@ -585,7 +590,7 @@ def sanitize_objects(ndarray[object] values, set na_values): for i from 0 <= i < n: val = values[i] - if val == '' or val in na_values: + if (convert_empty and val == '') or (val in na_values): values[i] = onan na_count += 1 elif val in memo: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1906c3936292d..5d88b671b0961 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3078,6 +3078,19 @@ def test_to_csv_from_csv(self): result = DataFrame.from_csv(path, index_col=[0, 1, 2], parse_dates=False) assert_frame_equal(result, df) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_csv(path, header=col_aliases) + rs = DataFrame.from_csv(path) + xp = self.frame2.copy() + xp.columns = col_aliases + + assert_frame_equal(xp, rs) + + self.assertRaises(ValueError, self.frame2.to_csv, path, + header=['AA', 'X']) + os.remove(path) def test_to_csv_multiindex(self): @@ -3262,6 +3275,14 @@ def test_to_excel_from_excel(self): np.testing.assert_equal('test1', reader.sheet_names[0]) np.testing.assert_equal('test2', reader.sheet_names[1]) + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_excel(path, 'test1', header=col_aliases) + reader = ExcelFile(path) + rs = reader.parse('test1', index_col=0) + xp = self.frame2.copy() + xp.columns = col_aliases + assert_frame_equal(xp, rs) os.remove(path)