Skip to content

ENH/BUG: pass formatting params thru to to_csv #5414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 17, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1036,20 +1036,29 @@ The Series and DataFrame objects have an instance method ``to_csv`` which
allows storing the contents of the object as a comma-separated-values file. The
function takes a number of arguments. Only the first is required.

- ``path``: A string path to the file to write
- ``path_or_buf``: A string path to the file to write or a StringIO
- ``sep`` : Field delimiter for the output file (default ",")
- ``na_rep``: A string representation of a missing value (default '')
- ``float_format``: Format string for floating point numbers
- ``cols``: Columns to write (default None)
- ``header``: Whether to write out the column names (default True)
- ``index``: whether to write row (index) names (default True)
- ``index_label``: Column label(s) for index column(s) if desired. If None
(default), and `header` and `index` are True, then the index names are
used. (A sequence should be given if the DataFrame uses MultiIndex).
- ``mode`` : Python write mode, default 'w'
- ``sep`` : Field delimiter for the output file (default ",")
- ``encoding``: a string representing the encoding to use if the contents are
non-ascii, for python versions prior to 3
- ``tupleize_cols``: boolean, default False, if False, write as a list of tuples,
otherwise write in an expanded line format suitable for ``read_csv``
- ``line_terminator``: Character sequence denoting line end (default '\\n')
- ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL)
- ``quotechar``: Character used to quote fields (default '"')
- ``doublequote``: Control quoting of ``quotechar`` in fields (default True)
- ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when
appropriate (default None)
- ``chunksize``: Number of rows to write at a time
- ``tupleize_cols``: If False (default), write in an expanded line format
suitable for ``read_csv``, otherwise write as a list of tuples
- ``date_format``: Format string for datetime objects

Writing a formatted string
~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
3 changes: 2 additions & 1 deletion doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ Improvements to existing features
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
- Testing statements updated to use specialized asserts (:issue:`6175`)
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)

- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)

.. _release.bug_fixes-0.14.0:

Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ Enhancements

household.join(portfolio, how='inner')

- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)

Performance
~~~~~~~~~~~

Expand Down
8 changes: 7 additions & 1 deletion pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -947,7 +947,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
cols=None, header=True, index=True, index_label=None,
mode='w', nanRep=None, encoding=None, quoting=None,
line_terminator='\n', chunksize=None, engine=None,
tupleize_cols=False, quotechar='"', date_format=None):
tupleize_cols=False, quotechar='"', date_format=None,
doublequote=True, escapechar=None):

self.engine = engine # remove for 0.13
self.obj = obj
Expand All @@ -972,6 +973,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
quotechar = None
self.quotechar = quotechar

self.doublequote = doublequote
self.escapechar = escapechar

self.line_terminator = line_terminator

self.date_format = date_format
Expand Down Expand Up @@ -1151,6 +1155,8 @@ def save(self):
try:
writer_kwargs = dict(lineterminator=self.line_terminator,
delimiter=self.sep, quoting=self.quoting,
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar)
if self.encoding is not None:
writer_kwargs['encoding'] = self.encoding
Expand Down
21 changes: 15 additions & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,8 +1070,9 @@ def to_panel(self):
def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
cols=None, header=True, index=True, index_label=None,
mode='w', nanRep=None, encoding=None, quoting=None,
line_terminator='\n', chunksize=None,
tupleize_cols=False, date_format=None, **kwds):
quotechar='"', line_terminator='\n', chunksize=None,
tupleize_cols=False, date_format=None, doublequote=True,
escapechar=None, **kwds):
r"""Write DataFrame to a comma-separated values (csv) file

Parameters
Expand Down Expand Up @@ -1109,13 +1110,19 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
file
quoting : optional constant from csv module
defaults to csv.QUOTE_MINIMAL
quotechar : string (length 1), default '"'
character used to quote fields
doublequote : boolean, default True
Control quoting of `quotechar` inside a field
escapechar : string (length 1), default None
character used to escape `sep` and `quotechar` when appropriate
chunksize : int or None
rows to write at a time
tupleize_cols : boolean, default False
write multi_index columns as a list of tuples (if True)
or new (expanded format) if False)
date_format : string, default None
Format string for datetime objects.
Format string for datetime objects
"""
if nanRep is not None: # pragma: no cover
warnings.warn("nanRep is deprecated, use na_rep",
Expand All @@ -1129,10 +1136,12 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
float_format=float_format, cols=cols,
header=header, index=index,
index_label=index_label, mode=mode,
chunksize=chunksize, engine=kwds.get(
"engine"),
chunksize=chunksize, quotechar=quotechar,
engine=kwds.get("engine"),
tupleize_cols=tupleize_cols,
date_format=date_format)
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar)
formatter.save()

def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
Expand Down
3 changes: 2 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
Default (None) results in QUOTE_MINIMAL behavior.
skipinitialspace : boolean, default False
Skip spaces after delimiter
escapechar : string
escapechar : string (length 1), default None
One-character string used to escape delimiter when quoting is QUOTE_NONE.
dtype : Type name or dict of column -> type
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
compression : {'gzip', 'bz2', None}, default None
Expand Down
97 changes: 95 additions & 2 deletions pandas/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1669,10 +1669,10 @@ def test_to_latex(self):
\end{tabular}
"""
self.assertEqual(withoutindex_result, withoutindex_expected)

def test_to_latex_escape_special_chars(self):
special_characters = ['&','%','$','#','_',
'{','}','~','^','\\']
'{','}','~','^','\\']
df = DataFrame(data=special_characters)
observed = df.to_latex()
expected = r"""\begin{tabular}{ll}
Expand All @@ -1694,6 +1694,99 @@ def test_to_latex_escape_special_chars(self):
"""
self.assertEqual(observed, expected)

def test_to_csv_quotechar(self):
    """Check that ``to_csv`` applies ``quotechar`` when quoting every field.

    Three cases are covered, each once with the default engine and once
    with ``engine='python'``:

    * the default quote character (``"``),
    * a custom quote character (``$``),
    * ``quotechar=None``, which must raise a ``TypeError`` whose message
      mentions ``quotechar``.
    """
    frame = DataFrame({'col' : [1,2]})
    # An empty dict leaves ``engine`` unset; the second entry selects the
    # 'python' engine explicitly.
    engine_options = ({}, {'engine': 'python'})

    default_quoted = """\
"","col"
"0","1"
"1","2"
"""
    for opts in engine_options:
        with tm.ensure_clean('test.csv') as path:
            frame.to_csv(path, quoting=1, **opts)  # 1=QUOTE_ALL
            with open(path, 'r') as handle:
                written = handle.read()
            self.assertEqual(written, default_quoted)

    dollar_quoted = """\
$$,$col$
$0$,$1$
$1$,$2$
"""
    for opts in engine_options:
        with tm.ensure_clean('test.csv') as path:
            frame.to_csv(path, quoting=1, quotechar="$", **opts)
            with open(path, 'r') as handle:
                written = handle.read()
            self.assertEqual(written, dollar_quoted)

    # QUOTE_ALL without a quote character is expected to be rejected.
    for opts in engine_options:
        with tm.ensure_clean('test.csv') as path:
            with tm.assertRaisesRegexp(TypeError, 'quotechar'):
                frame.to_csv(path, quoting=1, quotechar=None, **opts)
def test_to_csv_doublequote(self):
    """Check ``to_csv`` handling of the ``doublequote`` option.

    With ``doublequote=True`` and QUOTE_ALL (``quoting=1``), each quote
    character embedded in a field is expected to be written twice.  With
    ``doublequote=False`` and no ``escapechar`` given, the write is expected
    to fail with ``csv.Error`` mentioning ``escapechar``.  Every case runs
    once with the default engine and once with ``engine='python'``.
    """
    # Import from the public ``csv`` module instead of the private ``_csv``
    # extension module; ``csv.Error`` is the documented exception.
    from csv import Error

    df = DataFrame({'col' : ['a"a', '"bb"']})
    expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''
    # An empty dict leaves ``engine`` unset; the second entry selects the
    # 'python' engine explicitly.
    engine_kwargs = ({}, {'engine': 'python'})

    for kwargs in engine_kwargs:
        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, doublequote=True, **kwargs)  # QUOTE_ALL
            with open(path, 'r') as f:
                self.assertEqual(f.read(), expected)

    for kwargs in engine_kwargs:
        with tm.ensure_clean('test.csv') as path:
            with tm.assertRaisesRegexp(Error, 'escapechar'):
                df.to_csv(path, doublequote=False, **kwargs)  # no escapechar set
def test_to_csv_escapechar(self):
    """Check that ``to_csv`` applies ``escapechar`` to special characters.

    Two scenarios are covered, each once with the default engine and once
    with ``engine='python'``:

    * QUOTE_ALL (``quoting=1``) with ``doublequote=False``: embedded quote
      characters are expected to be prefixed with the escape character.
    * QUOTE_NONE (``quoting=3``): embedded separators are expected to be
      prefixed with the escape character.
    """
    # An empty dict leaves ``engine`` unset; the second entry selects the
    # 'python' engine explicitly.
    engine_options = ({}, {'engine': 'python'})

    quote_frame = DataFrame({'col' : ['a"a', '"bb"']})
    quote_expected = """\
"","col"
"0","a\\"a"
"1","\\"bb\\""
"""
    for opts in engine_options:
        with tm.ensure_clean('test.csv') as path:  # QUOTE_ALL
            quote_frame.to_csv(path, quoting=1, doublequote=False,
                               escapechar='\\', **opts)
            with open(path, 'r') as handle:
                written = handle.read()
            self.assertEqual(written, quote_expected)

    comma_frame = DataFrame({'col' : ['a,a', ',bb,']})
    comma_expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""
    for opts in engine_options:
        with tm.ensure_clean('test.csv') as path:
            comma_frame.to_csv(path, quoting=3, escapechar='\\',
                               **opts)  # QUOTE_NONE
            with open(path, 'r') as handle:
                written = handle.read()
            self.assertEqual(written, comma_expected)

class TestSeriesFormatting(tm.TestCase):
_multiprocess_can_split_ = True

Expand Down