diff --git a/doc/source/io.rst b/doc/source/io.rst index 7b7b0e745872a..ac2cabe009694 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1036,8 +1036,10 @@ The Series and DataFrame objects have an instance method ``to_csv`` which allows storing the contents of the object as a comma-separated-values file. The function takes a number of arguments. Only the first is required. - - ``path``: A string path to the file to write + - ``path_or_buf``: A string path to the file to write or a StringIO + - ``sep`` : Field delimiter for the output file (default ",") - ``na_rep``: A string representation of a missing value (default '') + - ``float_format``: Format string for floating point numbers - ``cols``: Columns to write (default None) - ``header``: Whether to write out the column names (default True) - ``index``: whether to write row (index) names (default True) @@ -1045,11 +1047,18 @@ function takes a number of arguments. Only the first is required. (default), and `header` and `index` are True, then the index names are used. (A sequence should be given if the DataFrame uses MultiIndex). 
- ``mode`` : Python write mode, default 'w' - - ``sep`` : Field delimiter for the output file (default ",") - ``encoding``: a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 - - ``tupleize_cols``: boolean, default False, if False, write as a list of tuples, - otherwise write in an expanded line format suitable for ``read_csv`` + - ``line_terminator``: Character sequence denoting line end (default '\\n') + - ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL) + - ``quotechar``: Character used to quote fields (default '"') + - ``doublequote``: Control quoting of ``quotechar`` in fields (default True) + - ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when + appropriate (default None) + - ``chunksize``: Number of rows to write at a time + - ``tupleize_cols``: If True, write multi-index columns as a list of tuples; + if False (default), write in an expanded line format suitable for ``read_csv`` + - ``date_format``: Format string for datetime objects Writing a formatted string ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index 8b753abc83ca7..2440c8651006e 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -85,7 +85,8 @@ Improvements to existing features - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) - Testing statements updated to use specialized asserts (:issue: `6175`) - ``Series.rank()`` now has a percentage rank option (:issue: `5971`) - +- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when + using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) .. 
_release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 597067609bf7f..58ae5084c4827 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -179,6 +179,9 @@ Enhancements household.join(portfolio, how='inner') +- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when + using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) + Performance ~~~~~~~~~~~ diff --git a/pandas/core/format.py b/pandas/core/format.py index f452ee11ae84f..04413970440b9 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -947,7 +947,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None, engine=None, - tupleize_cols=False, quotechar='"', date_format=None): + tupleize_cols=False, quotechar='"', date_format=None, + doublequote=True, escapechar=None): self.engine = engine # remove for 0.13 self.obj = obj @@ -972,6 +973,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, quotechar = None self.quotechar = quotechar + self.doublequote = doublequote + self.escapechar = escapechar + self.line_terminator = line_terminator self.date_format = date_format @@ -1151,6 +1155,8 @@ def save(self): try: writer_kwargs = dict(lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting, + doublequote=self.doublequote, + escapechar=self.escapechar, quotechar=self.quotechar) if self.encoding is not None: writer_kwargs['encoding'] = self.encoding diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9f9af187d21dd..e66e09624a04f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1070,8 +1070,9 @@ def to_panel(self): def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, - 
line_terminator='\n', chunksize=None, - tupleize_cols=False, date_format=None, **kwds): + quotechar='"', line_terminator='\n', chunksize=None, + tupleize_cols=False, date_format=None, doublequote=True, + escapechar=None, **kwds): r"""Write DataFrame to a comma-separated values (csv) file Parameters @@ -1109,13 +1110,19 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, file quoting : optional constant from csv module defaults to csv.QUOTE_MINIMAL + quotechar : string (length 1), default '"' + character used to quote fields + doublequote : boolean, default True + Control quoting of `quotechar` inside a field + escapechar : string (length 1), default None + character used to escape `sep` and `quotechar` when appropriate chunksize : int or None rows to write at a time tupleize_cols : boolean, default False write multi_index columns as a list of tuples (if True) or new (expanded format) if False) date_format : string, default None - Format string for datetime objects. + Format string for datetime objects """ if nanRep is not None: # pragma: no cover warnings.warn("nanRep is deprecated, use na_rep", @@ -1129,10 +1136,12 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, float_format=float_format, cols=cols, header=header, index=index, index_label=index_label, mode=mode, - chunksize=chunksize, engine=kwds.get( - "engine"), + chunksize=chunksize, quotechar=quotechar, + engine=kwds.get("engine"), tupleize_cols=tupleize_cols, - date_format=date_format) + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar) formatter.save() def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6b0d56b5c383e..e7d9145aa9d68 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -46,7 +46,8 @@ Default (None) results in QUOTE_MINIMAL behavior. 
skipinitialspace : boolean, default False Skip spaces after delimiter -escapechar : string +escapechar : string (length 1), default None + One-character string used to escape delimiter when quoting is QUOTE_NONE. dtype : Type name or dict of column -> type Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} compression : {'gzip', 'bz2', None}, default None diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index a0fd992b3a532..ac42266b3c4eb 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -1669,10 +1669,10 @@ def test_to_latex(self): \end{tabular} """ self.assertEqual(withoutindex_result, withoutindex_expected) - + def test_to_latex_escape_special_chars(self): special_characters = ['&','%','$','#','_', - '{','}','~','^','\\'] + '{','}','~','^','\\'] df = DataFrame(data=special_characters) observed = df.to_latex() expected = r"""\begin{tabular}{ll} @@ -1694,6 +1694,99 @@ def test_to_latex_escape_special_chars(self): """ self.assertEqual(observed, expected) + def test_to_csv_quotechar(self): + df = DataFrame({'col' : [1,2]}) + expected = """\ +"","col" +"0","1" +"1","2" +""" + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1) # 1=QUOTE_ALL + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + expected = """\ +$$,$col$ +$0$,$1$ +$1$,$2$ +""" + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, quotechar="$") + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, quotechar="$", engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(TypeError, 'quotechar'): + df.to_csv(path, quoting=1, quotechar=None) + 
with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(TypeError, 'quotechar'): + df.to_csv(path, quoting=1, quotechar=None, engine='python') + + def test_to_csv_doublequote(self): + df = DataFrame({'col' : ['a"a', '"bb"']}) + expected = '''\ +"","col" +"0","a""a" +"1","""bb""" +''' + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, doublequote=True, engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + from _csv import Error + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(Error, 'escapechar'): + df.to_csv(path, doublequote=False) # no escapechar set + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(Error, 'escapechar'): + df.to_csv(path, doublequote=False, engine='python') + + def test_to_csv_escapechar(self): + df = DataFrame({'col' : ['a"a', '"bb"']}) + expected = """\ +"","col" +"0","a\\"a" +"1","\\"bb\\"" +""" + with tm.ensure_clean('test.csv') as path: # QUOTE_ALL + df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, doublequote=False, escapechar='\\', + engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + df = DataFrame({'col' : ['a,a', ',bb,']}) + expected = """\ +,col +0,a\\,a +1,\\,bb\\, +""" + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=3, escapechar='\\', engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + class TestSeriesFormatting(tm.TestCase): 
_multiprocess_can_split_ = True