From 4f7571589dd15ef98a3bd241484ff32bea73c9d1 Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 19:10:13 -0400 Subject: [PATCH 1/3] ENH: option to disable empty string conversion to NaN in file readers --- pandas/io/parsers.py | 4 ++-- pandas/io/tests/test_parsers.py | 26 ++++++++++++++++++++++++++ pandas/src/inference.pyx | 15 ++++++++++----- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8261b889b4500..3b2fedd65b45f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1029,9 +1029,9 @@ def _convert_types(values, na_values): return values, na_count try: - result = lib.maybe_convert_numeric(values, na_values) + result = lib.maybe_convert_numeric(values, na_values, False) except Exception: - na_count = lib.sanitize_objects(values, na_values) + na_count = lib.sanitize_objects(values, na_values, False) result = values if result.dtype == np.object_: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7a75e687a4a8a..1f14df528af8f 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -52,6 +52,32 @@ def setUp(self): self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') + def test_empty_string(self): + data = """\ +One,Two,Three +a,1,one +b,2,two +,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = read_csv(StringIO(data)) + xp = DataFrame({'One' : ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = read_csv(StringIO(data), na_values={'One': [], 'Three': []}) + xp = DataFrame({'One' : ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', 'nan', 'five', + '', 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + def test_read_csv(self): pass diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 4057d240fceb6..451b998b0481b 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -241,7 +241,8 @@ def is_date_array(ndarray[object] values): return True -def maybe_convert_numeric(ndarray[object] values, set na_values): +def maybe_convert_numeric(ndarray[object] values, set na_values, + convert_empty=True): ''' Type inference function-- convert strings to numeric (potentially) and convert to proper dtype array @@ -275,8 +276,11 @@ def maybe_convert_numeric(ndarray[object] values, set na_values): floats[i] = complexes[i] = nan seen_float = 1 elif len(val) == 0: - floats[i] = complexes[i] = nan - seen_float = 1 + if convert_empty: + floats[i] = complexes[i] = nan + seen_float = 1 + else: + raise ValueError('Empty string encountered') elif util.is_complex_object(val): complexes[i] = val seen_complex = 1 @@ -573,7 +577,8 @@ def try_parse_datetime_components(ndarray[object] years, ndarray[object] months, return result -def sanitize_objects(ndarray[object] values, set na_values): +def sanitize_objects(ndarray[object] values, set na_values, + convert_empty=True): cdef: Py_ssize_t i, n object val, onan @@ -585,7 +590,7 @@ def sanitize_objects(ndarray[object] values, set na_values): for i from 0 <= i < n: val = values[i] - if val == '' or val in na_values: + if (convert_empty and val == '') or (val in na_values): values[i] = onan na_count += 1 elif val in memo: From 4b1f9d2d69a97a06922f9fd55790ca3ed0dad598 Mon Sep 17 00:00:00 2001 From: Chang She Date: Sun, 24 Jun 2012 19:39:08 -0400 Subject: [PATCH 2/3] ENH: column aliases for to_csv/to_excel #921 --- pandas/core/frame.py | 31 ++++++++++++++++++++++--------- pandas/tests/test_frame.py | 19 +++++++++++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4d41c02ad4200..8e2e48b26fce8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1021,7 +1021,8 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) - def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, + def _helper_csvexcel(self, writer, na_rep=None, cols=None, + col_aliases=None, header=True, index=True, index_label=None): if cols is None: cols = self.columns @@ -1050,7 +1051,15 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, index_label = [index_label] encoded_labels = list(index_label) - encoded_cols = list(cols) + if col_aliases is not None: + if len(col_aliases) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(col_aliases)))) + else: + write_cols = col_aliases + else: + write_cols = cols + encoded_cols = list(write_cols) writer.writerow(encoded_labels + encoded_cols) else: @@ -1075,8 +1084,8 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, header=True, writer.writerow(row_fields) def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, - header=True, index=True, index_label=None, mode='w', - nanRep=None, encoding=None): + col_aliases=None, header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None): """ Write DataFrame to a comma-separated values (csv) file @@ -1088,6 +1097,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, Missing data representation cols : sequence, optional Columns to write + col_aliases : sequence, default None + Optional column aliases to be written instead of column names header : boolean, default True Write out column names index : boolean, default True @@ -1123,14 +1134,16 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, else: csvout = csv.writer(f, lineterminator='\n', delimiter=sep) self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols, - header=header, index=index, - index_label=index_label) + col_aliases=col_aliases, header=header, + index=index, index_label=index_label) + finally: if close: f.close() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', - cols=None, header=True, index=True, index_label=None): + cols=None, col_aliases=None, header=True, index=True, + index_label=None): """ Write DataFrame to a excel sheet @@ -1170,8 +1183,8 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', need_save = True excel_writer.cur_sheet = sheet_name self._helper_csvexcel(excel_writer, na_rep=na_rep, cols=cols, - header=header, index=index, - index_label=index_label) + col_aliases=col_aliases, header=header, + index=index, index_label=index_label) if need_save: excel_writer.save() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1906c3936292d..72a2276f670b5 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3078,6 +3078,16 @@ def test_to_csv_from_csv(self): result = DataFrame.from_csv(path, index_col=[0, 1, 2], parse_dates=False) assert_frame_equal(result, df) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_csv(path, header=True, col_aliases=col_aliases) + rs = DataFrame.from_csv(path) + xp = self.frame2.copy() + xp.columns = col_aliases + + assert_frame_equal(xp, rs) + os.remove(path) def test_to_csv_multiindex(self): @@ -3262,6 +3272,15 @@ def test_to_excel_from_excel(self): np.testing.assert_equal('test1', reader.sheet_names[0]) np.testing.assert_equal('test2', reader.sheet_names[1]) + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_excel(path, 'test1', header=True, + col_aliases=col_aliases) + reader = ExcelFile(path) + rs = reader.parse('test1') + xp = self.frame2.copy() + xp.columns = col_aliases + assert_frame_equal(xp, rs) os.remove(path) From 32bfd7429709c8a731312e7020964407a9a9867b Mon Sep 17 00:00:00 2001 From: Chang She Date: Mon, 25 Jun 2012 10:07:12 -0400 Subject: [PATCH 3/3] overload header keyword instead of extra col_aliases keyword --- pandas/core/frame.py | 40 +++++++++++++++++++------------------- pandas/tests/test_frame.py | 10 ++++++---- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8e2e48b26fce8..0edd6a0bc7b9d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1022,15 +1022,16 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) def _helper_csvexcel(self, writer, na_rep=None, cols=None, - col_aliases=None, header=True, - index=True, index_label=None): + header=True, index=True, index_label=None): if cols is None: cols = self.columns series = {} for k, v in self._series.iteritems(): series[k] = v.values - if header: + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or header: if index: # should write something for index label if index_label is None: @@ -1051,12 +1052,12 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, index_label = [index_label] encoded_labels = list(index_label) - if col_aliases is not None: - if len(col_aliases) != len(cols): + if has_aliases: + if len(header) != len(cols): raise ValueError(('Writing %d cols but got %d aliases' - % (len(cols), len(col_aliases)))) + % (len(cols), len(header)))) else: - write_cols = col_aliases + write_cols = header else: write_cols = cols encoded_cols = list(write_cols) @@ -1084,7 +1085,7 @@ def _helper_csvexcel(self, writer, na_rep=None, cols=None, writer.writerow(row_fields) def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, - col_aliases=None, header=True, index=True, index_label=None, + header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None): """ Write DataFrame to a comma-separated values (csv) file @@ -1097,10 +1098,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, Missing data representation cols : sequence, optional Columns to write - col_aliases : sequence, default None - Optional column aliases to be written instead of column names - header : boolean, default True - Write out column names + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names index : boolean, default True Write row names (index) index_label : string or sequence, default None @@ -1134,16 +1134,15 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, else: csvout = csv.writer(f, lineterminator='\n', delimiter=sep) self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols, - col_aliases=col_aliases, header=header, - index=index, index_label=index_label) + header=header, index=index, + index_label=index_label) finally: if close: f.close() def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', - cols=None, col_aliases=None, header=True, index=True, - index_label=None): + cols=None, header=True, index=True, index_label=None): """ Write DataFrame to a excel sheet @@ -1157,8 +1156,9 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', Missing data rep'n cols : sequence, optional Columns to write - header : boolean, default True - Write out column names + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names index : boolean, default True Write row names (index) index_label : string or sequence, default None @@ -1183,8 +1183,8 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', need_save = True excel_writer.cur_sheet = sheet_name self._helper_csvexcel(excel_writer, na_rep=na_rep, cols=cols, - col_aliases=col_aliases, header=header, - index=index, index_label=index_label) + header=header, index=index, + index_label=index_label) if need_save: excel_writer.save() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 72a2276f670b5..5d88b671b0961 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3081,13 +3081,16 @@ def test_to_csv_from_csv(self): # column aliases col_aliases = Index(['AA', 'X', 'Y', 'Z']) - self.frame2.to_csv(path, header=True, col_aliases=col_aliases) + self.frame2.to_csv(path, header=col_aliases) rs = DataFrame.from_csv(path) xp = self.frame2.copy() xp.columns = col_aliases assert_frame_equal(xp, rs) + self.assertRaises(ValueError, self.frame2.to_csv, path, + header=['AA', 'X']) + os.remove(path) def test_to_csv_multiindex(self): @@ -3274,10 +3277,9 @@ def test_to_excel_from_excel(self): # column aliases col_aliases = Index(['AA', 'X', 'Y', 'Z']) - self.frame2.to_excel(path, 'test1', header=True, - col_aliases=col_aliases) + self.frame2.to_excel(path, 'test1', header=col_aliases) reader = ExcelFile(path) - rs = reader.parse('test1') + rs = reader.parse('test1', index_col=0) xp = self.frame2.copy() xp.columns = col_aliases assert_frame_equal(xp, rs)