Skip to content

Commit 973e076

Browse files
committed
Merge pull request #5414 from patricktokeeffe/pass-thru-to_csv-params
ENH/BUG: pass formatting params thru to `to_csv`
2 parents f716c21 + 791d441 commit 973e076

File tree

7 files changed

+137
-15
lines changed

7 files changed

+137
-15
lines changed

doc/source/io.rst

+13-4
Original file line numberDiff line numberDiff line change
@@ -1036,20 +1036,29 @@ The Series and DataFrame objects have an instance method ``to_csv`` which
10361036
allows storing the contents of the object as a comma-separated-values file. The
10371037
function takes a number of arguments. Only the first is required.
10381038

1039-
- ``path``: A string path to the file to write
1039+
- ``path_or_buf``: A string path to the file to write or a StringIO
1040+
- ``sep`` : Field delimiter for the output file (default ",")
10401041
- ``na_rep``: A string representation of a missing value (default '')
1042+
- ``float_format``: Format string for floating point numbers
10411043
- ``cols``: Columns to write (default None)
10421044
- ``header``: Whether to write out the column names (default True)
10431045
- ``index``: whether to write row (index) names (default True)
10441046
- ``index_label``: Column label(s) for index column(s) if desired. If None
10451047
(default), and `header` and `index` are True, then the index names are
10461048
used. (A sequence should be given if the DataFrame uses MultiIndex).
10471049
- ``mode`` : Python write mode, default 'w'
1048-
- ``sep`` : Field delimiter for the output file (default ",")
10491050
- ``encoding``: a string representing the encoding to use if the contents are
10501051
non-ascii, for python versions prior to 3
1051-
- ``tupleize_cols``: boolean, default False, if False, write as a list of tuples,
1052-
otherwise write in an expanded line format suitable for ``read_csv``
1052+
- ``line_terminator``: Character sequence denoting line end (default '\\n')
1053+
- ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL)
1054+
- ``quotechar``: Character used to quote fields (default '"')
1055+
- ``doublequote``: Control quoting of ``quotechar`` in fields (default True)
1056+
- ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when
1057+
appropriate (default None)
1058+
- ``chunksize``: Number of rows to write at a time
1059+
- ``tupleize_cols``: If True, write as a list of tuples; if False (default),
1060+
write in an expanded line format suitable for ``read_csv``
1061+
- ``date_format``: Format string for datetime objects
10531062

10541063
Writing a formatted string
10551064
~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/release.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ Improvements to existing features
8686
- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`)
8787
- Testing statements updated to use specialized asserts (:issue:`6175`)
8888
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
89-
89+
- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
90+
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
9091

9192
.. _release.bug_fixes-0.14.0:
9293

doc/source/v0.14.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ Enhancements
179179

180180
household.join(portfolio, how='inner')
181181

182+
- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
183+
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
184+
182185
Performance
183186
~~~~~~~~~~~
184187

pandas/core/format.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -947,7 +947,8 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
947947
cols=None, header=True, index=True, index_label=None,
948948
mode='w', nanRep=None, encoding=None, quoting=None,
949949
line_terminator='\n', chunksize=None, engine=None,
950-
tupleize_cols=False, quotechar='"', date_format=None):
950+
tupleize_cols=False, quotechar='"', date_format=None,
951+
doublequote=True, escapechar=None):
951952

952953
self.engine = engine # remove for 0.13
953954
self.obj = obj
@@ -972,6 +973,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
972973
quotechar = None
973974
self.quotechar = quotechar
974975

976+
self.doublequote = doublequote
977+
self.escapechar = escapechar
978+
975979
self.line_terminator = line_terminator
976980

977981
self.date_format = date_format
@@ -1151,6 +1155,8 @@ def save(self):
11511155
try:
11521156
writer_kwargs = dict(lineterminator=self.line_terminator,
11531157
delimiter=self.sep, quoting=self.quoting,
1158+
doublequote=self.doublequote,
1159+
escapechar=self.escapechar,
11541160
quotechar=self.quotechar)
11551161
if self.encoding is not None:
11561162
writer_kwargs['encoding'] = self.encoding

pandas/core/frame.py

+15-6
Original file line numberDiff line numberDiff line change
@@ -1070,8 +1070,9 @@ def to_panel(self):
10701070
def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
10711071
cols=None, header=True, index=True, index_label=None,
10721072
mode='w', nanRep=None, encoding=None, quoting=None,
1073-
line_terminator='\n', chunksize=None,
1074-
tupleize_cols=False, date_format=None, **kwds):
1073+
quotechar='"', line_terminator='\n', chunksize=None,
1074+
tupleize_cols=False, date_format=None, doublequote=True,
1075+
escapechar=None, **kwds):
10751076
r"""Write DataFrame to a comma-separated values (csv) file
10761077
10771078
Parameters
@@ -1109,13 +1110,19 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
11091110
file
11101111
quoting : optional constant from csv module
11111112
defaults to csv.QUOTE_MINIMAL
1113+
quotechar : string (length 1), default '"'
1114+
character used to quote fields
1115+
doublequote : boolean, default True
1116+
Control quoting of `quotechar` inside a field
1117+
escapechar : string (length 1), default None
1118+
character used to escape `sep` and `quotechar` when appropriate
11121119
chunksize : int or None
11131120
rows to write at a time
11141121
tupleize_cols : boolean, default False
11151122
write multi_index columns as a list of tuples (if True)
11161123
or in the new expanded format (if False)
11171124
date_format : string, default None
1118-
Format string for datetime objects.
1125+
Format string for datetime objects
11191126
"""
11201127
if nanRep is not None: # pragma: no cover
11211128
warnings.warn("nanRep is deprecated, use na_rep",
@@ -1129,10 +1136,12 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
11291136
float_format=float_format, cols=cols,
11301137
header=header, index=index,
11311138
index_label=index_label, mode=mode,
1132-
chunksize=chunksize, engine=kwds.get(
1133-
"engine"),
1139+
chunksize=chunksize, quotechar=quotechar,
1140+
engine=kwds.get("engine"),
11341141
tupleize_cols=tupleize_cols,
1135-
date_format=date_format)
1142+
date_format=date_format,
1143+
doublequote=doublequote,
1144+
escapechar=escapechar)
11361145
formatter.save()
11371146

11381147
def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',

pandas/io/parsers.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@
4646
Default (None) results in QUOTE_MINIMAL behavior.
4747
skipinitialspace : boolean, default False
4848
Skip spaces after delimiter
49-
escapechar : string
49+
escapechar : string (length 1), default None
50+
One-character string used to escape delimiter when quoting is QUOTE_NONE.
5051
dtype : Type name or dict of column -> type
5152
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
5253
compression : {'gzip', 'bz2', None}, default None

pandas/tests/test_format.py

+95-2
Original file line numberDiff line numberDiff line change
@@ -1669,10 +1669,10 @@ def test_to_latex(self):
16691669
\end{tabular}
16701670
"""
16711671
self.assertEqual(withoutindex_result, withoutindex_expected)
1672-
1672+
16731673
def test_to_latex_escape_special_chars(self):
16741674
special_characters = ['&','%','$','#','_',
1675-
'{','}','~','^','\\']
1675+
'{','}','~','^','\\']
16761676
df = DataFrame(data=special_characters)
16771677
observed = df.to_latex()
16781678
expected = r"""\begin{tabular}{ll}
@@ -1694,6 +1694,99 @@ def test_to_latex_escape_special_chars(self):
16941694
"""
16951695
self.assertEqual(observed, expected)
16961696

1697+
def test_to_csv_quotechar(self):
1698+
df = DataFrame({'col' : [1,2]})
1699+
expected = """\
1700+
"","col"
1701+
"0","1"
1702+
"1","2"
1703+
"""
1704+
with tm.ensure_clean('test.csv') as path:
1705+
df.to_csv(path, quoting=1) # 1=QUOTE_ALL
1706+
with open(path, 'r') as f:
1707+
self.assertEqual(f.read(), expected)
1708+
with tm.ensure_clean('test.csv') as path:
1709+
df.to_csv(path, quoting=1, engine='python')
1710+
with open(path, 'r') as f:
1711+
self.assertEqual(f.read(), expected)
1712+
1713+
expected = """\
1714+
$$,$col$
1715+
$0$,$1$
1716+
$1$,$2$
1717+
"""
1718+
with tm.ensure_clean('test.csv') as path:
1719+
df.to_csv(path, quoting=1, quotechar="$")
1720+
with open(path, 'r') as f:
1721+
self.assertEqual(f.read(), expected)
1722+
with tm.ensure_clean('test.csv') as path:
1723+
df.to_csv(path, quoting=1, quotechar="$", engine='python')
1724+
with open(path, 'r') as f:
1725+
self.assertEqual(f.read(), expected)
1726+
1727+
with tm.ensure_clean('test.csv') as path:
1728+
with tm.assertRaisesRegexp(TypeError, 'quotechar'):
1729+
df.to_csv(path, quoting=1, quotechar=None)
1730+
with tm.ensure_clean('test.csv') as path:
1731+
with tm.assertRaisesRegexp(TypeError, 'quotechar'):
1732+
df.to_csv(path, quoting=1, quotechar=None, engine='python')
1733+
1734+
def test_to_csv_doublequote(self):
1735+
df = DataFrame({'col' : ['a"a', '"bb"']})
1736+
expected = '''\
1737+
"","col"
1738+
"0","a""a"
1739+
"1","""bb"""
1740+
'''
1741+
with tm.ensure_clean('test.csv') as path:
1742+
df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
1743+
with open(path, 'r') as f:
1744+
self.assertEqual(f.read(), expected)
1745+
with tm.ensure_clean('test.csv') as path:
1746+
df.to_csv(path, quoting=1, doublequote=True, engine='python')
1747+
with open(path, 'r') as f:
1748+
self.assertEqual(f.read(), expected)
1749+
1750+
from _csv import Error
1751+
with tm.ensure_clean('test.csv') as path:
1752+
with tm.assertRaisesRegexp(Error, 'escapechar'):
1753+
df.to_csv(path, doublequote=False) # no escapechar set
1754+
with tm.ensure_clean('test.csv') as path:
1755+
with tm.assertRaisesRegexp(Error, 'escapechar'):
1756+
df.to_csv(path, doublequote=False, engine='python')
1757+
1758+
def test_to_csv_escapechar(self):
1759+
df = DataFrame({'col' : ['a"a', '"bb"']})
1760+
expected = """\
1761+
"","col"
1762+
"0","a\\"a"
1763+
"1","\\"bb\\""
1764+
"""
1765+
with tm.ensure_clean('test.csv') as path: # QUOTE_ALL
1766+
df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
1767+
with open(path, 'r') as f:
1768+
self.assertEqual(f.read(), expected)
1769+
with tm.ensure_clean('test.csv') as path:
1770+
df.to_csv(path, quoting=1, doublequote=False, escapechar='\\',
1771+
engine='python')
1772+
with open(path, 'r') as f:
1773+
self.assertEqual(f.read(), expected)
1774+
1775+
df = DataFrame({'col' : ['a,a', ',bb,']})
1776+
expected = """\
1777+
,col
1778+
0,a\\,a
1779+
1,\\,bb\\,
1780+
"""
1781+
with tm.ensure_clean('test.csv') as path:
1782+
df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE
1783+
with open(path, 'r') as f:
1784+
self.assertEqual(f.read(), expected)
1785+
with tm.ensure_clean('test.csv') as path:
1786+
df.to_csv(path, quoting=3, escapechar='\\', engine='python')
1787+
with open(path, 'r') as f:
1788+
self.assertEqual(f.read(), expected)
1789+
16971790
class TestSeriesFormatting(tm.TestCase):
16981791
_multiprocess_can_split_ = True
16991792

0 commit comments

Comments
 (0)