From c679b9ffd61aec87f3113b08de47118e23e93691 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 00:46:36 +0000 Subject: [PATCH 01/11] base fixture and test for added function --- pandas/tests/io/conftest.py | 7 +++ .../io/parser/data/salaries_comments.csv | 50 +++++++++++++++++++ pandas/tests/io/test_common.py | 12 +++++ 3 files changed, 69 insertions(+) create mode 100644 pandas/tests/io/parser/data/salaries_comments.csv diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index b863e85cae457..b84a18538308f 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -36,6 +36,13 @@ def salaries_table(datapath): return read_csv(datapath("io", "parser", "data", "salaries.csv"), sep="\t") +@pytest.fixture +def salaries_table_comments(datapath): + return read_csv( + datapath("io", "parser", "data", "salaries_comments.csv"), sep="\t", comment="#" + ) + + @pytest.fixture def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") diff --git a/pandas/tests/io/parser/data/salaries_comments.csv b/pandas/tests/io/parser/data/salaries_comments.csv new file mode 100644 index 0000000000000..33e99f0e980f7 --- /dev/null +++ b/pandas/tests/io/parser/data/salaries_comments.csv @@ -0,0 +1,50 @@ +#line one +#line_two +#three lines_hello_world +S X E M +13876 1 1 1 +11608 1 3 0 +18701 1 3 1 +11283 1 2 0 +11767 1 3 0 +20872 2 2 1 +11772 2 2 0 +10535 2 1 0 +12195 2 3 0 +12313 3 2 0 +14975 3 1 1 +21371 3 2 1 +19800 3 3 1 +11417 4 1 0 +20263 4 3 1 +13231 4 3 0 +12884 4 2 0 +13245 5 2 0 +13677 5 3 0 +15965 5 1 1 +12336 6 1 0 +21352 6 3 1 +13839 6 2 0 +22884 6 2 1 +16978 7 1 1 +14803 8 2 0 +17404 8 1 1 +22184 8 3 1 +13548 8 1 0 +14467 10 1 0 +15942 10 2 0 +23174 10 3 1 +23780 10 2 1 +25410 11 2 1 +14861 11 1 0 +16882 12 2 0 +24170 12 3 1 +15990 13 1 0 +26330 13 2 1 +17949 14 2 0 +25685 15 3 1 +27837 16 2 1 +18838 16 2 0 +17483 16 1 0 +19207 17 2 0 +19346 20 1 0 diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 435b9bdade944..0451a81e1df0c 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -620,3 +620,15 @@ def test_pickle_reader(reader): # GH 22265 with BytesIO() as buffer: pickle.dump(reader, buffer) + + +def test_comment_writer(salaries_table, salaries_table_comments): + tm.assert_frame_equal(salaries_table, salaries_table_comments) + with tm.ensure_clean() as path: + # salaries_table.to_csv(path, comment="#", comment_lines=comment_lines) + salaries_table.to_csv(path, sep="\t", index=False) + new_table = pd.read_csv(path, sep="\t") + print(salaries_table.head()) + print(new_table.head()) + # new_table = pd.read_csv(path, comment="#", sep="\t") + tm.assert_frame_equal(salaries_table, new_table) From 1e5747e9462cee17acc6d811834aa2ff2a3f9068 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 18:30:35 +0000 Subject: [PATCH 02/11] Framework for to_csv to function with comment writing --- pandas/core/generic.py | 3 +++ pandas/io/formats/csvs.py | 10 ++++++++++ pandas/io/formats/format.py | 4 ++++ pandas/tests/io/test_common.py | 25 +++++++++++++++++-------- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 91083f4018c06..d1b4367dccbcc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -15,6 +15,7 @@ Callable, ClassVar, Hashable, + Iterable, Iterator, Literal, Mapping, @@ -3662,6 +3663,8 @@ def to_csv( decimal: str = ".", errors: OpenFileErrors = "strict", storage_options: StorageOptions = None, + comment: str | None = None, + comment_lines: Iterable[str] | None = None, ) -> str | None: r""" Write object to a comma-separated values (csv) file. diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 672f7c1f71b15..5e400357cdafd 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -10,6 +10,7 @@ TYPE_CHECKING, Any, Hashable, + Iterable, Iterator, Sequence, cast, @@ -67,6 +68,8 @@ def __init__( doublequote: bool = True, escapechar: str | None = None, storage_options: StorageOptions = None, + comment: str | None = None, + comment_lines: Iterable[str] | None = None, ) -> None: self.fmt = formatter @@ -89,6 +92,8 @@ def __init__( self.date_format = date_format self.cols = self._initialize_columns(cols) self.chunksize = self._initialize_chunksize(chunksize) + self.comment = comment + self.comment_lines = comment_lines @property def na_rep(self) -> str: @@ -260,6 +265,8 @@ def save(self) -> None: self._save() def _save(self) -> None: + if self.comment_lines: + self._save_comment_lines() if self._need_to_save_header: self._save_header() self._save_body() @@ -318,3 +325,6 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: self.cols, self.writer, ) + + def _save_comments(self): + pass diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index a425944647b5c..96e40304ace39 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1117,6 +1117,8 @@ def to_csv( escapechar: str | None = None, errors: str = "strict", storage_options: StorageOptions = None, + comment: str | None = None, + comment_lines: Iterable[str] | None = None, ) -> str | None: """ Render dataframe as comma-separated file. @@ -1147,6 +1149,8 @@ def to_csv( escapechar=escapechar, storage_options=storage_options, formatter=self.fmt, + comment=comment, + comment_lines=comment_lines, ) csv_formatter.save() diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 0451a81e1df0c..4a51cff250a1c 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -3,6 +3,7 @@ """ import codecs import errno +import filecmp from functools import partial from io import ( BytesIO, @@ -622,13 +623,21 @@ def test_pickle_reader(reader): pickle.dump(reader, buffer) -def test_comment_writer(salaries_table, salaries_table_comments): +def test_comment_writer(salaries_table, salaries_table_comments, datapath): + comment = "#" + comment_lines = ["line one", "line_two", "three lines_hello_world"] tm.assert_frame_equal(salaries_table, salaries_table_comments) with tm.ensure_clean() as path: - # salaries_table.to_csv(path, comment="#", comment_lines=comment_lines) - salaries_table.to_csv(path, sep="\t", index=False) - new_table = pd.read_csv(path, sep="\t") - print(salaries_table.head()) - print(new_table.head()) - # new_table = pd.read_csv(path, comment="#", sep="\t") - tm.assert_frame_equal(salaries_table, new_table) + # Check commented table can be read and matches non-commented version + tm.assert_frame_equal(salaries_table, salaries_table_comments) + + # Write comments on uncommted table, validate + salaries_table.to_csv( + path, sep="\t", comment=comment, comment_lines=comment_lines + ) + + assert filecmp.cmp( + path, + datapath("io", "parser", "data", "salaries_comments.csv"), + shallow=False, + ), "Generated csv file with comments does not match expectation" From 9d8727b376868c1fe9040081a4159dcfe1dff64a Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 19:17:42 +0000 Subject: [PATCH 03/11] Feature added, comment lines can be written out to csv files --- pandas/core/generic.py | 6 ++ pandas/io/formats/csvs.py | 7 +- .../io/parser/data/salaries_comments.csv | 92 +++++++++---------- pandas/tests/io/test_common.py | 11 ++- testing-test.csv | 50 ++++++++++ testing.py | 10 ++ 6 files changed, 125 insertions(+), 51 deletions(-) create mode 100644 testing-test.csv create mode 100644 testing.py diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d1b4367dccbcc..a90407f96f7d4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3605,6 +3605,8 @@ def to_csv( decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., + comment: str | None = ..., + comment_lines: Iterable[str] | None = ..., ) -> str: ... @@ -3632,6 +3634,8 @@ def to_csv( decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., + comment: str | None = ..., + comment_lines: Iterable[str] | None = ..., ) -> None: ... @@ -3836,6 +3840,8 @@ def to_csv( doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, + comment=comment, + comment_lines=comment_lines, ) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 5e400357cdafd..1bfb34f478047 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -265,7 +265,7 @@ def save(self) -> None: self._save() def _save(self) -> None: - if self.comment_lines: + if self.comment: self._save_comment_lines() if self._need_to_save_header: self._save_header() @@ -326,5 +326,6 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: self.writer, ) - def _save_comments(self): - pass + def _save_comment_lines(self): + for line in self.comment_lines: + self.writer.writerow([f"{self.comment}" + line]) diff --git a/pandas/tests/io/parser/data/salaries_comments.csv b/pandas/tests/io/parser/data/salaries_comments.csv index 33e99f0e980f7..3384ce64c4dd0 100644 --- a/pandas/tests/io/parser/data/salaries_comments.csv +++ b/pandas/tests/io/parser/data/salaries_comments.csv @@ -2,49 +2,49 @@ #line_two #three lines_hello_world S X E M -13876 1 1 1 -11608 1 3 0 -18701 1 3 1 -11283 1 2 0 -11767 1 3 0 -20872 2 2 1 -11772 2 2 0 -10535 2 1 0 -12195 2 3 0 -12313 3 2 0 -14975 3 1 1 -21371 3 2 1 -19800 3 3 1 -11417 4 1 0 -20263 4 3 1 -13231 4 3 0 -12884 4 2 0 -13245 5 2 0 -13677 5 3 0 -15965 5 1 1 -12336 6 1 0 -21352 6 3 1 -13839 6 2 0 -22884 6 2 1 -16978 7 1 1 -14803 8 2 0 -17404 8 1 1 -22184 8 3 1 -13548 8 1 0 -14467 10 1 0 -15942 10 2 0 -23174 10 3 1 -23780 10 2 1 -25410 11 2 1 -14861 11 1 0 -16882 12 2 0 -24170 12 3 1 -15990 13 1 0 -26330 13 2 1 -17949 14 2 0 -25685 15 3 1 -27837 16 2 1 -18838 16 2 0 -17483 16 1 0 -19207 17 2 0 -19346 20 1 0 +13876 1 1 1 +11608 1 3 0 +18701 1 3 1 +11283 1 2 0 +11767 1 3 0 +20872 2 2 1 +11772 2 2 0 +10535 2 1 0 +12195 2 3 0 +12313 3 2 0 +14975 3 1 1 +21371 3 2 1 +19800 3 3 1 +11417 4 1 0 +20263 4 3 1 +13231 4 3 0 +12884 4 2 0 +13245 5 2 0 +13677 5 3 0 +15965 5 1 1 +12336 6 1 0 +21352 6 3 1 +13839 6 2 0 +22884 6 2 1 +16978 7 1 1 +14803 8 2 0 +17404 8 1 1 +22184 8 3 1 +13548 8 1 0 +14467 10 1 0 +15942 10 2 0 +23174 10 3 1 +23780 10 2 1 +25410 11 2 1 +14861 11 1 0 +16882 12 2 0 +24170 12 3 1 +15990 13 1 0 +26330 13 2 1 +17949 14 2 0 +25685 15 3 1 +27837 16 2 1 +18838 16 2 0 +17483 16 1 0 +19207 17 2 0 +19346 20 1 0 diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 4a51cff250a1c..ec858d9ffc258 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -631,11 +631,18 @@ def test_comment_writer(salaries_table, salaries_table_comments, datapath): # Check commented table can be read and matches non-commented version tm.assert_frame_equal(salaries_table, salaries_table_comments) - # Write comments on uncommted table, validate + # Write comments on uncommented table, validate salaries_table.to_csv( - path, sep="\t", comment=comment, comment_lines=comment_lines + path, sep="\t", comment=comment, comment_lines=comment_lines, index=False ) + salaries_table.to_csv( + "testing-test.csv", + sep="\t", + comment=comment, + comment_lines=comment_lines, + index=False, + ) assert filecmp.cmp( path, datapath("io", "parser", "data", "salaries_comments.csv"), diff --git a/testing-test.csv b/testing-test.csv new file mode 100644 index 0000000000000..3384ce64c4dd0 --- /dev/null +++ b/testing-test.csv @@ -0,0 +1,50 @@ +#line one +#line_two +#three lines_hello_world +S X E M +13876 1 1 1 +11608 1 3 0 +18701 1 3 1 +11283 1 2 0 +11767 1 3 0 +20872 2 2 1 +11772 2 2 0 +10535 2 1 0 +12195 2 3 0 +12313 3 2 0 +14975 3 1 1 +21371 3 2 1 +19800 3 3 1 +11417 4 1 0 +20263 4 3 1 +13231 4 3 0 +12884 4 2 0 +13245 5 2 0 +13677 5 3 0 +15965 5 1 1 +12336 6 1 0 +21352 6 3 1 +13839 6 2 0 +22884 6 2 1 +16978 7 1 1 +14803 8 2 0 +17404 8 1 1 +22184 8 3 1 +13548 8 1 0 +14467 10 1 0 +15942 10 2 0 +23174 10 3 1 +23780 10 2 1 +25410 11 2 1 +14861 11 1 0 +16882 12 2 0 +24170 12 3 1 +15990 13 1 0 +26330 13 2 1 +17949 14 2 0 +25685 15 3 1 +27837 16 2 1 +18838 16 2 0 +17483 16 1 0 +19207 17 2 0 +19346 20 1 0 diff --git a/testing.py b/testing.py new file mode 100644 index 0000000000000..bc9a884f059f2 --- /dev/null +++ b/testing.py @@ -0,0 +1,10 @@ +import numpy as np + +import pandas as pd + +comment_lines = ["Hello", "world", "longer sentence here"] + +df = pd.DataFrame(np.random.rand(4, 4)) + +df.to_csv("testing.csv", index=False) +df.to_csv("testing.csv2", index=False, comment="#", comment_lines=comment_lines) From 85be77ea7ccfe4f73c747a97dadde13dd188635b Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 19:19:59 +0000 Subject: [PATCH 04/11] removed testing file --- testing-test.csv | 50 ------------------------------------------------ 1 file changed, 50 deletions(-) delete mode 100644 testing-test.csv diff --git a/testing-test.csv b/testing-test.csv deleted file mode 100644 index 3384ce64c4dd0..0000000000000 --- a/testing-test.csv +++ /dev/null @@ -1,50 +0,0 @@ -#line one -#line_two -#three lines_hello_world -S X E M -13876 1 1 1 -11608 1 3 0 -18701 1 3 1 -11283 1 2 0 -11767 1 3 0 -20872 2 2 1 -11772 2 2 0 -10535 2 1 0 -12195 2 3 0 -12313 3 2 0 -14975 3 1 1 -21371 3 2 1 -19800 3 3 1 -11417 4 1 0 -20263 4 3 1 -13231 4 3 0 -12884 4 2 0 -13245 5 2 0 -13677 5 3 0 -15965 5 1 1 -12336 6 1 0 -21352 6 3 1 -13839 6 2 0 -22884 6 2 1 -16978 7 1 1 -14803 8 2 0 -17404 8 1 1 -22184 8 3 1 -13548 8 1 0 -14467 10 1 0 -15942 10 2 0 -23174 10 3 1 -23780 10 2 1 -25410 11 2 1 -14861 11 1 0 -16882 12 2 0 -24170 12 3 1 -15990 13 1 0 -26330 13 2 1 -17949 14 2 0 -25685 15 3 1 -27837 16 2 1 -18838 16 2 0 -17483 16 1 0 -19207 17 2 0 -19346 20 1 0 From 4fb183abd2ac0668329586652e953cb3c6a8acf3 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 19:22:33 +0000 Subject: [PATCH 05/11] removed testing file --- testing.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 testing.py diff --git a/testing.py b/testing.py deleted file mode 100644 index bc9a884f059f2..0000000000000 --- a/testing.py +++ /dev/null @@ -1,10 +0,0 @@ -import numpy as np - -import pandas as pd - -comment_lines = ["Hello", "world", "longer sentence here"] - -df = pd.DataFrame(np.random.rand(4, 4)) - -df.to_csv("testing.csv", index=False) -df.to_csv("testing.csv2", index=False, comment="#", comment_lines=comment_lines) From 8dd124af809847aab9de98a0bab9db519f939207 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 19:24:17 +0000 Subject: [PATCH 06/11] cleaned up comments --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index ec858d9ffc258..b8c61b79b1e8a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -631,7 +631,7 @@ def test_comment_writer(salaries_table, salaries_table_comments, datapath): # Check commented table can be read and matches non-commented version tm.assert_frame_equal(salaries_table, salaries_table_comments) - # Write comments on uncommented table, validate + # Write comments on uncommented table then validate salaries_table.to_csv( path, sep="\t", comment=comment, comment_lines=comment_lines, index=False ) From 8fbcfaf5653e52c2a6b905f0b98d8e2c90073791 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 19:34:01 +0000 Subject: [PATCH 07/11] modified docstring --- pandas/core/generic.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a90407f96f7d4..6820670ae304a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3774,6 +3774,17 @@ def to_csv( .. versionadded:: 1.2.0 + comment: str, default None + Prefix which should be written to lines preceding the body + of an output csv. These lines can be used for comments or + metadata which are not part of the csv data itself. Complement + of pd.read_csv 'comment' param. + comment_lines: Iterable['str'], default None + Comment or metadata lines to write to the beginning of the csv + file. Each item is a row and will be prefixed with the character in + the 'comment' param + + Returns ------- None or str From acfa35d64e87be64d2e34a5a7fc1d464f80b6f20 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 19:58:47 +0000 Subject: [PATCH 08/11] prevent errors when comment is supplied, but comment_lines is None --- pandas/io/formats/csvs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 1bfb34f478047..726912c82c8f3 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -327,5 +327,6 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: ) def _save_comment_lines(self): - for line in self.comment_lines: - self.writer.writerow([f"{self.comment}" + line]) + if self.comment_lines: + for line in self.comment_lines: + self.writer.writerow([f"{self.comment}" + line]) From 6db73d855c0768bd7907a927eb77695bac623fd5 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 20:54:33 +0000 Subject: [PATCH 09/11] fixed missing return type hint --- pandas/io/formats/csvs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 726912c82c8f3..f1c05bbe76327 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -326,7 +326,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: self.writer, ) - def _save_comment_lines(self): + def _save_comment_lines(self) -> None: if self.comment_lines: for line in self.comment_lines: self.writer.writerow([f"{self.comment}" + line]) From 895d4f45953c49a018c4e50ae044752738feaaa5 Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Thu, 8 Jun 2023 21:09:00 +0000 Subject: [PATCH 10/11] fixed failing docstring ci tests --- pandas/core/generic.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6820670ae304a..b1e4e114c1833 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3774,16 +3774,15 @@ def to_csv( .. versionadded:: 1.2.0 - comment: str, default None + comment : str, default None Prefix which should be written to lines preceding the body of an output csv. These lines can be used for comments or metadata which are not part of the csv data itself. Complement of pd.read_csv 'comment' param. - comment_lines: Iterable['str'], default None + comment_lines : Iterable['str'], default None Comment or metadata lines to write to the beginning of the csv file. Each item is a row and will be prefixed with the character in - the 'comment' param - + the 'comment' param. Returns ------- From a4f13a55e422a482472cc9819a5274ba1fed109c Mon Sep 17 00:00:00 2001 From: Anthony Scott Date: Fri, 9 Jun 2023 14:50:19 +0000 Subject: [PATCH 11/11] removed file write call that was used for testing --- pandas/tests/io/test_common.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b8c61b79b1e8a..2136a0275dd00 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -636,13 +636,6 @@ def test_comment_writer(salaries_table, salaries_table_comments, datapath): path, sep="\t", comment=comment, comment_lines=comment_lines, index=False ) - salaries_table.to_csv( - "testing-test.csv", - sep="\t", - comment=comment, - comment_lines=comment_lines, - index=False, - ) assert filecmp.cmp( path, datapath("io", "parser", "data", "salaries_comments.csv"),