BUG: print DataFrame columns in the right order, also convert NAs in string columns, GH #325

wesm · wesm · commit 6a0452b50ae7 · 2011-11-04T20:31:02.000-04:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2890,7 +2890,11 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
         self.formatters = formatters
         self.na_rep = na_rep
         self.col_space = col_space
-        self.column_filter = frame.columns if columns is None else set(columns)
+
+        if columns is not None:
+            self.columns = _ensure_index(columns)
+        else:
+            self.columns = frame.columns
 
         self._write_to_buffer()
 
@@ -2909,8 +2913,7 @@ def _write_to_buffer(self):
             str_columns = self._get_formatted_column_labels()
 
             stringified = [str_columns[i] + format_col(c)
-                           for i, c in enumerate(frame.columns)
-                           if c in self.column_filter]
+                           for i, c in enumerate(self.columns)]
 
             to_write.append(adjoin(1, str_index, *stringified))
 
@@ -2946,18 +2949,16 @@ def _format_col(col):
     def _get_formatted_column_labels(self):
         from pandas.core.index import _sparsify
 
-        columns = self.frame.columns
-
-        if isinstance(columns, MultiIndex):
-            fmt_columns = columns.format(sparsify=False, adjoin=False)
+        if isinstance(self.columns, MultiIndex):
+            fmt_columns = self.columns.format(sparsify=False, adjoin=False)
             str_columns = zip(*[[' %s' % y for y in x]
                                 for x in zip(*fmt_columns)])
             if self.sparsify:
                 str_columns = _sparsify(str_columns)
 
             str_columns = [list(x) for x in zip(*str_columns)]
         else:
-            str_columns = [[' %s' % x] for x in columns.format()]
+            str_columns = [[' %s' % x] for x in self.columns.format()]
 
         if self.show_index_names and self.has_index_names:
             for x in str_columns:
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -424,7 +424,7 @@ def _convert_types(values, na_values):
     try:
         values = lib.maybe_convert_numeric(values, na_values)
     except Exception:
-        lib.sanitize_objects(values)
+        lib.sanitize_objects(values, na_values)
 
     if values.dtype == np.object_:
         return lib.maybe_convert_bool(values)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -51,6 +51,19 @@ def test_custom_na_values(self):
                          skiprows=[1])
         assert_almost_equal(df2.values, expected)
 
+    def test_detect_string_na(self):  # NA markers in string columns -> NaN
+        data = """A,B
+foo,bar
+NA,baz
+NaN,nan
+"""
+        expected = [['foo', 'bar'],
+                    [nan, 'baz'],  # 'NA' in column A is expected as NaN
+                    [nan, nan]]  # both 'NaN' and lowercase 'nan' expected as NaN
+
+        df = read_csv(StringIO(data))  # no na_values passed: defaults apply
+        assert_almost_equal(df.values, expected)
+
     def test_unnamed_columns(self):
         data = """A,B,C,,
 1,2,3,4,5
diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx
@@ -190,7 +190,7 @@ def try_parse_dates(ndarray[object] values, parser=None):
 
     return result
 
-def sanitize_objects(ndarray[object] values):
+def sanitize_objects(ndarray[object] values, set na_values):
     cdef:
         Py_ssize_t i, n
         object val, onan
@@ -200,7 +200,7 @@ def sanitize_objects(ndarray[object] values):
 
     for i from 0 <= i < n:
         val = values[i]
-        if val == '':
+        if val == '' or val in na_values:
             values[i] = onan
 
 def maybe_convert_bool(ndarray[object] arr):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -1324,6 +1324,9 @@ def test_repr_corner(self):
         foo = repr(df)
 
     def test_to_string(self):
+        from pandas import read_table
+        import re
+
         # big mixed
         biggie = DataFrame({'A' : randn(1000),
                              'B' : tm.makeStringIndex(1000)},
@@ -1340,7 +1343,25 @@ def test_to_string(self):
 
         self.assert_(isinstance(s, basestring))
 
-        biggie.to_string(columns=['B', 'A'], colSpace=17)
+        # print in right order
+        result = biggie.to_string(columns=['B', 'A'], colSpace=17,
+                                  float_format='%.6f'.__mod__)
+        lines = result.split('\n')
+        header = lines[0].strip().split()
+        joined = '\n'.join([re.sub('\s+', ' ', x).strip() for x in lines[1:]])
+        recons = read_table(StringIO(joined), names=header, sep=' ')
+        assert_series_equal(recons['B'], biggie['B'])
+        assert_series_equal(np.round(recons['A'], 2),
+                            np.round(biggie['A'], 2))
+
+        # expected = ['B', 'A']
+        # self.assertEqual(header, expected)
+
+        result = biggie.to_string(columns=['A'], colSpace=17)
+        header = result.split('\n')[0].strip().split()
+        expected = ['A']
+        self.assertEqual(header, expected)
+
         biggie.to_string(columns=['B', 'A'],
                          formatters={'A' : lambda x: '%.1f' % x})