Merge pull request #5414 from patricktokeeffe/pass-thru-to_csv-params

jreback · jreback · commit 973e0769da58 · 2014-02-17T09:19:06.000-05:00
ENH/BUG: pass formatting params thru to `to_csv`
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1036,20 +1036,29 @@ The Series and DataFrame objects have an instance method ``to_csv`` which
 allows storing the contents of the object as a comma-separated-values file. The
 function takes a number of arguments. Only the first is required.
 
-  - ``path``: A string path to the file to write
+  - ``path_or_buf``: A string path to the file to write, or a StringIO buffer
+  - ``sep`` : Field delimiter for the output file (default ",")
   - ``na_rep``: A string representation of a missing value (default '')
+  - ``float_format``: Format string for floating point numbers
   - ``cols``: Columns to write (default None)
   - ``header``: Whether to write out the column names (default True)
   - ``index``: whether to write row (index) names (default True)
   - ``index_label``: Column label(s) for index column(s) if desired. If None
     (default), and `header` and `index` are True, then the index names are
     used. (A sequence should be given if the DataFrame uses MultiIndex).
   - ``mode`` : Python write mode, default 'w'
-  - ``sep`` : Field delimiter for the output file (default ",")
   - ``encoding``: a string representing the encoding to use if the contents are
     non-ascii, for python versions prior to 3
-  - ``tupleize_cols``: boolean, default False, if False, write as a list of tuples,
-    otherwise write in an expanded line format suitable for ``read_csv``
+  - ``line_terminator``: Character sequence denoting line end (default '\\n')
+  - ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL)
+  - ``quotechar``: Character used to quote fields (default '"')
+  - ``doublequote``: Control quoting of ``quotechar`` in fields (default True)
+  - ``escapechar``: Character used to escape ``sep`` when quoting is QUOTE_NONE
+    and ``quotechar`` when ``doublequote`` is False (default None)
+  - ``chunksize``: Number of rows to write at a time
+  - ``tupleize_cols``: If True, write MultiIndex columns as a list of tuples;
+    if False (default), write in an expanded format suitable for ``read_csv``
+  - ``date_format``: Format string for datetime objects
 
 Writing a formatted string
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -86,7 +86,8 @@ Improvements to existing features
 - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
 - Testing statements updated to use specialized asserts (:issue: `6175`)
 - ``Series.rank()`` now has a percentage rank option (:issue: `5971`)
-
+- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
+  using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -179,6 +179,9 @@ Enhancements
 
      household.join(portfolio, how='inner')
 
+- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
+  using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
+
 Performance
 ~~~~~~~~~~~
 
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -947,7 +947,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
                  cols=None, header=True, index=True, index_label=None,
                  mode='w', nanRep=None, encoding=None, quoting=None,
                  line_terminator='\n', chunksize=None, engine=None,
-                 tupleize_cols=False, quotechar='"', date_format=None):
+                 tupleize_cols=False, quotechar='"', date_format=None,
+                 doublequote=True, escapechar=None):
 
         self.engine = engine  # remove for 0.13
         self.obj = obj
@@ -972,6 +973,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
             quotechar = None
         self.quotechar = quotechar
 
+        self.doublequote = doublequote
+        self.escapechar = escapechar
+
         self.line_terminator = line_terminator
 
         self.date_format = date_format
@@ -1151,6 +1155,8 @@ def save(self):
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,
                                  delimiter=self.sep, quoting=self.quoting,
+                                 doublequote=self.doublequote,
+                                 escapechar=self.escapechar,
                                  quotechar=self.quotechar)
             if self.encoding is not None:
                 writer_kwargs['encoding'] = self.encoding
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1070,8 +1070,9 @@ def to_panel(self):
     def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                cols=None, header=True, index=True, index_label=None,
                mode='w', nanRep=None, encoding=None, quoting=None,
-               line_terminator='\n', chunksize=None,
-               tupleize_cols=False, date_format=None, **kwds):
+               quotechar='"', line_terminator='\n', chunksize=None,
+               tupleize_cols=False, date_format=None, doublequote=True,
+               escapechar=None, **kwds):
         r"""Write DataFrame to a comma-separated values (csv) file
 
         Parameters
@@ -1109,13 +1110,19 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
             file
         quoting : optional constant from csv module
             defaults to csv.QUOTE_MINIMAL
+        quotechar : string (length 1), default '"'
+            character used to quote fields
+        doublequote : boolean, default True
+            Control quoting of `quotechar` inside a field
+        escapechar : string (length 1), default None
+            character used to escape `sep` and `quotechar` when appropriate
         chunksize : int or None
             rows to write at a time
         tupleize_cols : boolean, default False
             write multi_index columns as a list of tuples (if True)
             or new (expanded format) if False)
         date_format : string, default None
-            Format string for datetime objects.
+            Format string for datetime objects
         """
         if nanRep is not None:  # pragma: no cover
             warnings.warn("nanRep is deprecated, use na_rep",
@@ -1129,10 +1136,12 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
                                      float_format=float_format, cols=cols,
                                      header=header, index=index,
                                      index_label=index_label, mode=mode,
-                                     chunksize=chunksize, engine=kwds.get(
-                                         "engine"),
+                                     chunksize=chunksize, quotechar=quotechar,
+                                     engine=kwds.get("engine"),
                                      tupleize_cols=tupleize_cols,
-                                     date_format=date_format)
+                                     date_format=date_format,
+                                     doublequote=doublequote,
+                                     escapechar=escapechar)
         formatter.save()
 
     def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -46,7 +46,8 @@
     Default (None) results in QUOTE_MINIMAL behavior.
 skipinitialspace : boolean, default False
     Skip spaces after delimiter
-escapechar : string
+escapechar : string (length 1), default None
+    One-character string used to escape delimiter when quoting is QUOTE_NONE.
 dtype : Type name or dict of column -> type
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
 compression : {'gzip', 'bz2', None}, default None
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
@@ -1669,10 +1669,10 @@ def test_to_latex(self):
 \end{tabular}
 """
         self.assertEqual(withoutindex_result, withoutindex_expected)
-        
+
     def test_to_latex_escape_special_chars(self):
         special_characters = ['&','%','$','#','_',
-                               '{','}','~','^','\\'] 
+                               '{','}','~','^','\\']
         df = DataFrame(data=special_characters)
         observed = df.to_latex()
         expected = r"""\begin{tabular}{ll}
@@ -1694,6 +1694,99 @@ def test_to_latex_escape_special_chars(self):
 """
         self.assertEqual(observed, expected)
 
+    def test_to_csv_quotechar(self):
+        df = DataFrame({'col' : [1,2]})
+        expected = """\
+"","col"
+"0","1"
+"1","2"
+"""
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1) # 1=QUOTE_ALL
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1, engine='python')
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+
+        expected = """\
+$$,$col$
+$0$,$1$
+$1$,$2$
+"""
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1, quotechar="$")
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1, quotechar="$", engine='python')
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+
+        with tm.ensure_clean('test.csv') as path:
+            with tm.assertRaisesRegexp(TypeError, 'quotechar'):
+                df.to_csv(path, quoting=1, quotechar=None)
+        with tm.ensure_clean('test.csv') as path:
+            with tm.assertRaisesRegexp(TypeError, 'quotechar'):
+                df.to_csv(path, quoting=1, quotechar=None, engine='python')
+
+    def test_to_csv_doublequote(self):
+        df = DataFrame({'col' : ['a"a', '"bb"']})
+        expected = '''\
+"","col"
+"0","a""a"
+"1","""bb"""
+'''
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1, doublequote=True, engine='python')
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+
+        from _csv import Error
+        with tm.ensure_clean('test.csv') as path:
+            with tm.assertRaisesRegexp(Error, 'escapechar'):
+                df.to_csv(path, doublequote=False) # no escapechar set
+        with tm.ensure_clean('test.csv') as path:
+            with tm.assertRaisesRegexp(Error, 'escapechar'):
+                df.to_csv(path, doublequote=False, engine='python')
+
+    def test_to_csv_escapechar(self):
+        df = DataFrame({'col' : ['a"a', '"bb"']})
+        expected = """\
+"","col"
+"0","a\\"a"
+"1","\\"bb\\""
+"""
+        with tm.ensure_clean('test.csv') as path:   # QUOTE_ALL
+            df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=1, doublequote=False, escapechar='\\',
+                      engine='python')
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+
+        df = DataFrame({'col' : ['a,a', ',bb,']})
+        expected = """\
+,col
+0,a\\,a
+1,\\,bb\\,
+"""
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+        with tm.ensure_clean('test.csv') as path:
+            df.to_csv(path, quoting=3, escapechar='\\', engine='python')
+            with open(path, 'r') as f:
+                self.assertEqual(f.read(), expected)
+
 class TestSeriesFormatting(tm.TestCase):
     _multiprocess_can_split_ = True