From 42233492225072b1b7fb16edcc348698da96c815 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 22:16:20 +0700 Subject: [PATCH 01/37] REF: extract properties cols and has_mi_columns --- pandas/io/formats/csvs.py | 88 ++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c462a96da7133..8b630e82a9af7 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -79,41 +79,67 @@ def __init__( self.index = index self.index_label = index_label self.mode = mode - if encoding is None: - encoding = "utf-8" - self.encoding = encoding + self.encoding = encoding or "utf-8" self.errors = errors self.compression = infer_compression(self.path_or_buf, compression) + self.quoting = quoting or csvlib.QUOTE_MINIMAL - if quoting is None: - quoting = csvlib.QUOTE_MINIMAL - self.quoting = quoting - - if quoting == csvlib.QUOTE_NONE: + if self.quoting == csvlib.QUOTE_NONE: # prevents crash in _csv quotechar = None self.quotechar = quotechar self.doublequote = doublequote self.escapechar = escapechar - self.line_terminator = line_terminator or os.linesep - self.date_format = date_format + self.cols = cols + + # preallocate data 2d list + ncols = self.obj.shape[-1] + self.data = [None] * ncols + + if chunksize is None: + chunksize = (100000 // (len(self.cols) or 1)) or 1 + self.chunksize = int(chunksize) + + self.data_index = obj.index + if ( + isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and date_format is not None + ): + from pandas import Index + + self.data_index = Index( + [x.strftime(date_format) if notna(x) else "" for x in self.data_index] + ) + + self.nlevels = getattr(self.data_index, "nlevels", 1) + if not index: + self.nlevels = 0 + + @property + def has_mi_columns(self): + return isinstance(self.obj.columns, ABCMultiIndex) - self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex) + @property + def cols(self): + 
return self._cols + @cols.setter + def cols(self, cols): # validate mi options if self.has_mi_columns: if cols is not None: - raise TypeError("cannot specify cols with a MultiIndex on the columns") + msg = "cannot specify cols with a MultiIndex on the columns" + raise TypeError(msg) if cols is not None: if isinstance(cols, ABCIndexClass): cols = cols.to_native_types( - na_rep=na_rep, - float_format=float_format, - date_format=date_format, + na_rep=self.na_rep, + float_format=self.float_format, + date_format=self.date_format, quoting=self.quoting, ) else: @@ -125,39 +151,15 @@ def __init__( cols = self.obj.columns if isinstance(cols, ABCIndexClass): cols = cols.to_native_types( - na_rep=na_rep, - float_format=float_format, - date_format=date_format, + na_rep=self.na_rep, + float_format=self.float_format, + date_format=self.date_format, quoting=self.quoting, ) else: cols = list(cols) - # save it - self.cols = cols - - # preallocate data 2d list - ncols = self.obj.shape[-1] - self.data = [None] * ncols - - if chunksize is None: - chunksize = (100000 // (len(self.cols) or 1)) or 1 - self.chunksize = int(chunksize) - - self.data_index = obj.index - if ( - isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) - and date_format is not None - ): - from pandas import Index - - self.data_index = Index( - [x.strftime(date_format) if notna(x) else "" for x in self.data_index] - ) - - self.nlevels = getattr(self.data_index, "nlevels", 1) - if not index: - self.nlevels = 0 + self._cols = cols def save(self) -> None: """ From 58ef28309f4c1c6b57eb35e6f8353e7d1c99f6c2 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 22:21:18 +0700 Subject: [PATCH 02/37] REF: extract property chunksize --- pandas/io/formats/csvs.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 8b630e82a9af7..a65181a47f996 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ 
-99,9 +99,7 @@ def __init__( ncols = self.obj.shape[-1] self.data = [None] * ncols - if chunksize is None: - chunksize = (100000 // (len(self.cols) or 1)) or 1 - self.chunksize = int(chunksize) + self.chunksize = chunksize self.data_index = obj.index if ( @@ -161,6 +159,16 @@ def cols(self, cols): self._cols = cols + @property + def chunksize(self): + return self._chunksize + + @chunksize.setter + def chunksize(self, chunksize): + if chunksize is None: + chunksize = (100000 // (len(self.cols) or 1)) or 1 + self._chunksize = int(chunksize) + def save(self) -> None: """ Create the writer & save. From f4fe66d993fc97406f4bdd60687cde1c0866d7b2 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 22:24:09 +0700 Subject: [PATCH 03/37] REF: extract property quotechar --- pandas/io/formats/csvs.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index a65181a47f996..09cc334fe9513 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -83,12 +83,7 @@ def __init__( self.errors = errors self.compression = infer_compression(self.path_or_buf, compression) self.quoting = quoting or csvlib.QUOTE_MINIMAL - - if self.quoting == csvlib.QUOTE_NONE: - # prevents crash in _csv - quotechar = None self.quotechar = quotechar - self.doublequote = doublequote self.escapechar = escapechar self.line_terminator = line_terminator or os.linesep @@ -116,6 +111,16 @@ def __init__( if not index: self.nlevels = 0 + @property + def quotechar(self): + if self.quoting != csvlib.QUOTE_NONE: + # prevents crash in _csv + return self._quotechar + + @quotechar.setter + def quotechar(self, quotechar): + self._quotechar = quotechar + @property def has_mi_columns(self): return isinstance(self.obj.columns, ABCMultiIndex) From 59a2d2122474d347e3ccb1f2fe514ced98a3077c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 22:44:46 +0700 Subject: [PATCH 04/37] REF: extract properties 
data_index and nlevels --- pandas/io/formats/csvs.py | 42 +++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 09cc334fe9513..4faa479bb2be4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -89,28 +89,12 @@ def __init__( self.line_terminator = line_terminator or os.linesep self.date_format = date_format self.cols = cols + self.chunksize = chunksize # preallocate data 2d list ncols = self.obj.shape[-1] self.data = [None] * ncols - self.chunksize = chunksize - - self.data_index = obj.index - if ( - isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) - and date_format is not None - ): - from pandas import Index - - self.data_index = Index( - [x.strftime(date_format) if notna(x) else "" for x in self.data_index] - ) - - self.nlevels = getattr(self.data_index, "nlevels", 1) - if not index: - self.nlevels = 0 - @property def quotechar(self): if self.quoting != csvlib.QUOTE_NONE: @@ -174,6 +158,30 @@ def chunksize(self, chunksize): chunksize = (100000 // (len(self.cols) or 1)) or 1 self._chunksize = int(chunksize) + @property + def data_index(self): + data_index = self.obj.index + if ( + isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and self.date_format is not None + ): + from pandas import Index + + data_index = Index( + [ + x.strftime(self.date_format) if notna(x) else "" + for x in self.data_index + ] + ) + return data_index + + @property + def nlevels(self): + if self.index: + return getattr(self.data_index, "nlevels", 1) + else: + return 0 + def save(self) -> None: """ Create the writer & save. 
From 29256d496e99a9232c7f68453286904054a49e82 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 23:05:43 +0700 Subject: [PATCH 05/37] REF: refactor _save_chunk --- pandas/io/formats/csvs.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 4faa479bb2be4..72fed14045bae 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -340,17 +340,13 @@ def _save(self) -> None: self._save_chunk(start_i, end_i) def _save_chunk(self, start_i: int, end_i: int) -> None: - data_index = self.data_index - # create the data for a chunk slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] - blocks = df._mgr.blocks - for i in range(len(blocks)): - b = blocks[i] - d = b.to_native_types( + for block in df._mgr.blocks: + d = block.to_native_types( na_rep=self.na_rep, float_format=self.float_format, decimal=self.decimal, @@ -358,11 +354,11 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: quoting=self.quoting, ) - for col_loc, col in zip(b.mgr_locs, d): + for col_loc, col in zip(block.mgr_locs, d): # self.data is a preallocated list self.data[col_loc] = col - ix = data_index.to_native_types( + ix = self.data_index.to_native_types( slicer=slicer, na_rep=self.na_rep, float_format=self.float_format, From a6e84e188d406f7c43d51c8accd0705ff7a02529 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 23:13:22 +0700 Subject: [PATCH 06/37] REF: refactor _save --- pandas/io/formats/csvs.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 72fed14045bae..eb03fe484fe30 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -327,16 +327,13 @@ def _save(self) -> None: nrows = len(self.data_index) - # write in chunksize bites - chunksize = self.chunksize - chunks = int(nrows / chunksize) + 1 + chunks = int(nrows / self.chunksize) + 1 for i in 
range(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, nrows) + start_i = i * self.chunksize + end_i = min(start_i + self.chunksize, nrows) if start_i >= end_i: break - self._save_chunk(start_i, end_i) def _save_chunk(self, start_i: int, end_i: int) -> None: From c840b3fe45384971088d09d81dfc4cdfee936f34 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 23:14:38 +0700 Subject: [PATCH 07/37] REF: extract method _save_body --- pandas/io/formats/csvs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index eb03fe484fe30..c7199e0eaeb93 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -324,11 +324,11 @@ def _save_header(self): def _save(self) -> None: self._save_header() + self._save_body() + def _save_body(self) -> None: nrows = len(self.data_index) - chunks = int(nrows / self.chunksize) + 1 - for i in range(chunks): start_i = i * self.chunksize end_i = min(start_i + self.chunksize, nrows) From 682814618dd0ceaedb2a92dba7d98574bed130a3 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 23:15:50 +0700 Subject: [PATCH 08/37] REF: reorder _save-like methods --- pandas/io/formats/csvs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c7199e0eaeb93..8e28fda74e125 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -242,6 +242,10 @@ def save(self) -> None: for _fh in handles: _fh.close() + def _save(self) -> None: + self._save_header() + self._save_body() + def _save_header(self): writer = self.writer obj = self.obj @@ -322,10 +326,6 @@ def _save_header(self): encoded_labels.extend([""] * len(columns)) writer.writerow(encoded_labels) - def _save(self) -> None: - self._save_header() - self._save_body() - def _save_body(self) -> None: nrows = len(self.data_index) chunks = int(nrows / self.chunksize) + 1 From 
98d4e471a19aa8cee90db545eebb6b0e11fb93cf Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Sep 2020 23:21:24 +0700 Subject: [PATCH 09/37] REF: extract compression property --- pandas/io/formats/csvs.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 8e28fda74e125..76d898e9b89c7 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -60,9 +60,6 @@ def __init__( if path_or_buf is None: path_or_buf = StringIO() - # Extract compression mode as given, if dict - compression, self.compression_args = get_compression_method(compression) - self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, @@ -81,7 +78,7 @@ def __init__( self.mode = mode self.encoding = encoding or "utf-8" self.errors = errors - self.compression = infer_compression(self.path_or_buf, compression) + self.compression = compression self.quoting = quoting or csvlib.QUOTE_MINIMAL self.quotechar = quotechar self.doublequote = doublequote @@ -182,13 +179,20 @@ def nlevels(self): else: return 0 - def save(self) -> None: - """ - Create the writer & save. - """ + @property + def compression(self): + return self._compression + + @compression.setter + def compression(self, compression): + # Extract compression mode as given, if dict + compression, self.compression_args = get_compression_method(compression) + + compression = infer_compression(self.path_or_buf, compression) + # GH21227 internal compression is not used for non-binary handles. if ( - self.compression + compression and hasattr(self.path_or_buf, "write") and "b" not in self.mode ): @@ -197,8 +201,14 @@ def save(self) -> None: RuntimeWarning, stacklevel=2, ) - self.compression = None + compression = None + + self._compression = compression + def save(self) -> None: + """ + Create the writer & save. 
+ """ # get a handle or wrap an existing handle to take care of 1) compression and # 2) text -> byte conversion f, handles = get_handle( From d6b2827ebf4d92b871fc3b3d136e63285f23b1b8 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Sep 2020 00:01:54 +0700 Subject: [PATCH 10/37] REF: Extract property index_label --- pandas/io/formats/csvs.py | 52 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 76d898e9b89c7..bfcc6acfc961f 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -92,6 +92,32 @@ def __init__( ncols = self.obj.shape[-1] self.data = [None] * ncols + @property + def index_label(self): + return self._index_label + + @index_label.setter + def index_label(self, index_label): + if index_label is None: + self._index_label = self._get_index_label_from_obj() + elif not isinstance( + index_label, (list, tuple, np.ndarray, ABCIndexClass) + ): + # given a string for a DF with Index + self._index_label = [index_label] + + def _get_index_label_from_obj(self): + if isinstance(self.obj.index, ABCMultiIndex): + return self._get_index_label_multiindex() + return self._get_index_label_regular() + + def _get_index_label_multiindex(self): + return [name or "" for name in self.obj.index.names] + + def _get_index_label_regular(self): + index_label = self.obj.index.name + return [""] if index_label is None else [index_label] + @property def quotechar(self): if self.quoting != csvlib.QUOTE_NONE: @@ -268,6 +294,7 @@ def _save_header(self): has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) if not (has_aliases or self.header): return + if has_aliases: if len(header) != len(cols): raise ValueError( @@ -279,30 +306,7 @@ def _save_header(self): write_cols = cols if self.index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(obj.index, ABCMultiIndex): - 
index_label = [] - for i, name in enumerate(obj.index.names): - if name is None: - name = "" - index_label.append(name) - else: - index_label = obj.index.name - if index_label is None: - index_label = [""] - else: - index_label = [index_label] - elif not isinstance( - index_label, (list, tuple, np.ndarray, ABCIndexClass) - ): - # given a string for a DF with Index - index_label = [index_label] - - encoded_labels = list(index_label) - else: - encoded_labels = [] + encoded_labels = list(index_label) if not has_mi_columns or has_aliases: encoded_labels += list(write_cols) From 15dbc8333c9673b00387e92edcf9e95d9d6899e5 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Sep 2020 00:21:36 +0700 Subject: [PATCH 11/37] REF: extract helper properties --- pandas/io/formats/csvs.py | 62 ++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index bfcc6acfc961f..3f2aad3e075d2 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -231,6 +231,37 @@ def compression(self, compression): self._compression = compression + @property + def _has_aliases(self): + return isinstance(self.header, (tuple, list, np.ndarray, ABCIndexClass)) + + @property + def _need_to_save_header(self): + return self._has_aliases or self.header + + @property + def write_cols(self): + if self._has_aliases: + if len(self.header) != len(self.cols): + msg = f"Writing {len(self.cols)} cols but got {len(self.header)} aliases" + raise ValueError(msg) + else: + return self.header + else: + return self.cols + + @property + def encoded_labels(self): + encoded_labels = [] + + if self.index: + encoded_labels = list(self.index_label) + + if not self.has_mi_columns or self._has_aliases: + encoded_labels += list(self.write_cols) + + return encoded_labels + def save(self) -> None: """ Create the writer & save. 
@@ -283,34 +314,18 @@ def _save(self) -> None: self._save_body() def _save_header(self): + if not self._need_to_save_header: + return + writer = self.writer obj = self.obj index_label = self.index_label cols = self.cols has_mi_columns = self.has_mi_columns header = self.header - encoded_labels: List[str] = [] - - has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) - if not (has_aliases or self.header): - return - - if has_aliases: - if len(header) != len(cols): - raise ValueError( - f"Writing {len(cols)} cols but got {len(header)} aliases" - ) - else: - write_cols = header - else: - write_cols = cols - - if self.index: - encoded_labels = list(index_label) - if not has_mi_columns or has_aliases: - encoded_labels += list(write_cols) - writer.writerow(encoded_labels) + if any(self.encoded_labels): + writer.writerow(self.encoded_labels) else: # write out the mi columns = obj.columns @@ -336,9 +351,8 @@ def _save_header(self): # Write out the index line if it's not empty. # Otherwise, we will print out an extraneous # blank line between the mi and the data rows. 
- if encoded_labels and set(encoded_labels) != {""}: - encoded_labels.extend([""] * len(columns)) - writer.writerow(encoded_labels) + if self.encoded_labels and set(self.encoded_labels) != {""}: + writer.writerow([""] * len(columns)) def _save_body(self) -> None: nrows = len(self.data_index) From 5e7b7789846d0066d78f329a39ba9381b7a1684c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Sep 2020 00:23:57 +0700 Subject: [PATCH 12/37] REF: delete local variables in _save_header --- pandas/io/formats/csvs.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 3f2aad3e075d2..171cf9470ec0f 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -317,18 +317,11 @@ def _save_header(self): if not self._need_to_save_header: return - writer = self.writer - obj = self.obj - index_label = self.index_label - cols = self.cols - has_mi_columns = self.has_mi_columns - header = self.header - if any(self.encoded_labels): - writer.writerow(self.encoded_labels) + self.writer.writerow(self.encoded_labels) else: # write out the mi - columns = obj.columns + columns = self.obj.columns # write out the names for each level, then ALL of the values for # each level @@ -341,18 +334,18 @@ def _save_header(self): # name is the first column col_line.append(columns.names[i]) - if isinstance(index_label, list) and len(index_label) > 1: - col_line.extend([""] * (len(index_label) - 1)) + if isinstance(self.index_label, list) and len(self.index_label) > 1: + col_line.extend([""] * (len(self.index_label) - 1)) col_line.extend(columns._get_level_values(i)) - writer.writerow(col_line) + self.writer.writerow(col_line) # Write out the index line if it's not empty. # Otherwise, we will print out an extraneous # blank line between the mi and the data rows. 
if self.encoded_labels and set(self.encoded_labels) != {""}: - writer.writerow([""] * len(columns)) + self.writer.writerow([""] * len(columns)) def _save_body(self) -> None: nrows = len(self.data_index) From 6e3b3891892b8cf53f992508f4335f9944357c7d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Sep 2020 00:28:21 +0700 Subject: [PATCH 13/37] REF: extract method _get_header_rows --- pandas/io/formats/csvs.py | 45 ++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 171cf9470ec0f..bbc19a461ceea 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -320,32 +320,39 @@ def _save_header(self): if any(self.encoded_labels): self.writer.writerow(self.encoded_labels) else: - # write out the mi - columns = self.obj.columns + for row in self._get_header_rows(): + self.writer.writerow(row) - # write out the names for each level, then ALL of the values for - # each level - for i in range(columns.nlevels): + def _get_header_rows(self): + rows = [] - # we need at least 1 index column to write our col names - col_line = [] - if self.index: + # write out the mi + columns = self.obj.columns - # name is the first column - col_line.append(columns.names[i]) + # write out the names for each level, then ALL of the values for + # each level + for i in range(columns.nlevels): - if isinstance(self.index_label, list) and len(self.index_label) > 1: - col_line.extend([""] * (len(self.index_label) - 1)) + # we need at least 1 index column to write our col names + col_line = [] + if self.index: - col_line.extend(columns._get_level_values(i)) + # name is the first column + col_line.append(columns.names[i]) - self.writer.writerow(col_line) + if isinstance(self.index_label, list) and len(self.index_label) > 1: + col_line.extend([""] * (len(self.index_label) - 1)) - # Write out the index line if it's not empty. 
- # Otherwise, we will print out an extraneous - # blank line between the mi and the data rows. - if self.encoded_labels and set(self.encoded_labels) != {""}: - self.writer.writerow([""] * len(columns)) + col_line.extend(columns._get_level_values(i)) + rows.append(col_line) + + # Write out the index line if it's not empty. + # Otherwise, we will print out an extraneous + # blank line between the mi and the data rows. + if self.encoded_labels and set(self.encoded_labels) != {""}: + rows.append([""] * len(columns)) + + return rows def _save_body(self) -> None: nrows = len(self.data_index) From d733f0f8b88d5c32ffce46188a6deb5bb20ce0d6 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Sep 2020 00:29:13 +0700 Subject: [PATCH 14/37] REF: move check for header into _save function --- pandas/io/formats/csvs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index bbc19a461ceea..3ce4b9183180d 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -310,13 +310,11 @@ def save(self) -> None: _fh.close() def _save(self) -> None: - self._save_header() + if self._need_to_save_header: + self._save_header() self._save_body() def _save_header(self): - if not self._need_to_save_header: - return - if any(self.encoded_labels): self.writer.writerow(self.encoded_labels) else: From cdeb115e3216fe79b6f79bbadde1866ccfabf405 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Sep 2020 00:41:36 +0700 Subject: [PATCH 15/37] TYP: add several type annotations --- pandas/io/formats/csvs.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 3ce4b9183180d..f48e61e2a1197 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -97,24 +97,22 @@ def index_label(self): return self._index_label @index_label.setter - def index_label(self, index_label): + def 
index_label(self, index_label) -> None: if index_label is None: self._index_label = self._get_index_label_from_obj() - elif not isinstance( - index_label, (list, tuple, np.ndarray, ABCIndexClass) - ): + elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): # given a string for a DF with Index self._index_label = [index_label] - def _get_index_label_from_obj(self): + def _get_index_label_from_obj(self) -> List[str]: if isinstance(self.obj.index, ABCMultiIndex): return self._get_index_label_multiindex() return self._get_index_label_regular() - def _get_index_label_multiindex(self): - return [name or "" for name in self.obj.index.names] + def _get_index_label_multiindex(self) -> List[str]: + return [name or "" for name in self.obj.index.names] - def _get_index_label_regular(self): + def _get_index_label_regular(self) -> List[str]: index_label = self.obj.index.name return [""] if index_label is None else [index_label] @@ -217,11 +215,7 @@ def compression(self, compression): compression = infer_compression(self.path_or_buf, compression) # GH21227 internal compression is not used for non-binary handles. 
- if ( - compression - and hasattr(self.path_or_buf, "write") - and "b" not in self.mode - ): + if compression and hasattr(self.path_or_buf, "write") and "b" not in self.mode: warnings.warn( "compression has no effect when passing a non-binary object as input.", RuntimeWarning, @@ -243,16 +237,17 @@ def _need_to_save_header(self): def write_cols(self): if self._has_aliases: if len(self.header) != len(self.cols): - msg = f"Writing {len(self.cols)} cols but got {len(self.header)} aliases" - raise ValueError(msg) + raise ValueError( + f"Writing {len(self.cols)} cols but got {len(self.header)} aliases" + ) else: return self.header else: return self.cols @property - def encoded_labels(self): - encoded_labels = [] + def encoded_labels(self) -> List[str]: + encoded_labels: List[str] = [] if self.index: encoded_labels = list(self.index_label) @@ -314,14 +309,14 @@ def _save(self) -> None: self._save_header() self._save_body() - def _save_header(self): + def _save_header(self) -> None: if any(self.encoded_labels): self.writer.writerow(self.encoded_labels) else: for row in self._get_header_rows(): self.writer.writerow(row) - def _get_header_rows(self): + def _get_header_rows(self) -> List[List[str]]: rows = [] # write out the mi From 417e74a3d57ce2b0af2c67dce6140af1767a096b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Sep 2020 12:52:03 +0700 Subject: [PATCH 16/37] FIX: fix index labels --- pandas/io/formats/csvs.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index f48e61e2a1197..d5b46a93a965a 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -98,16 +98,20 @@ def index_label(self): @index_label.setter def index_label(self, index_label) -> None: - if index_label is None: - self._index_label = self._get_index_label_from_obj() - elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): - # given a string for a DF with Index - 
self._index_label = [index_label] + # should write something for index label + if index_label is not False: + if index_label is None: + index_label = self._get_index_label_from_obj() + elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): + # given a string for a DF with Index + index_label = [index_label] + self._index_label = index_label def _get_index_label_from_obj(self) -> List[str]: if isinstance(self.obj.index, ABCMultiIndex): return self._get_index_label_multiindex() - return self._get_index_label_regular() + else: + return self._get_index_label_regular() def _get_index_label_multiindex(self) -> List[str]: return [name or "" for name in self.obj.index.names] @@ -189,10 +193,7 @@ def data_index(self): from pandas import Index data_index = Index( - [ - x.strftime(self.date_format) if notna(x) else "" - for x in self.data_index - ] + [x.strftime(self.date_format) if notna(x) else "" for x in data_index] ) return data_index @@ -249,7 +250,7 @@ def write_cols(self): def encoded_labels(self) -> List[str]: encoded_labels: List[str] = [] - if self.index: + if self.index and self.index_label: encoded_labels = list(self.index_label) if not self.has_mi_columns or self._has_aliases: From 9df1d825f203eb8aeb4a9fd260503f99af3fd3c7 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 4 Sep 2020 15:08:21 +0000 Subject: [PATCH 17/37] FIX: fix multiindex --- pandas/io/formats/csvs.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index d5b46a93a965a..84b1a03ab4902 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -311,26 +311,18 @@ def _save(self) -> None: self._save_body() def _save_header(self) -> None: - if any(self.encoded_labels): + if not self.has_mi_columns or self._has_aliases: self.writer.writerow(self.encoded_labels) else: - for row in self._get_header_rows(): + for row in self._generate_multiindex_header_rows(): 
self.writer.writerow(row) - def _get_header_rows(self) -> List[List[str]]: - rows = [] - - # write out the mi + def _generate_multiindex_header_rows(self): columns = self.obj.columns - - # write out the names for each level, then ALL of the values for - # each level for i in range(columns.nlevels): - # we need at least 1 index column to write our col names col_line = [] if self.index: - # name is the first column col_line.append(columns.names[i]) @@ -338,15 +330,13 @@ def _get_header_rows(self) -> List[List[str]]: col_line.extend([""] * (len(self.index_label) - 1)) col_line.extend(columns._get_level_values(i)) - rows.append(col_line) + yield col_line # Write out the index line if it's not empty. # Otherwise, we will print out an extraneous # blank line between the mi and the data rows. if self.encoded_labels and set(self.encoded_labels) != {""}: - rows.append([""] * len(columns)) - - return rows + yield self.encoded_labels + [""] * len(columns) def _save_body(self) -> None: nrows = len(self.data_index) From 22955dbd9c317bab43e1c2deb426dc90be3956cd Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 00:07:44 +0700 Subject: [PATCH 18/37] FIX: fix test failures on compression Needed to eliminate compression setter due to the interdependencies between ioargs and compression. 
--- pandas/io/formats/csvs.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 0ad176b3dd346..d5e50f8b40f02 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -56,6 +56,7 @@ def __init__( storage_options: StorageOptions = None, ): self.obj = obj + self.encoding = encoding or "utf-8" if path_or_buf is None: path_or_buf = StringIO() @@ -66,7 +67,7 @@ def __init__( ioargs = get_filepath_or_buffer( path_or_buf, - encoding=encoding, + encoding=self.encoding, compression=self.compression, mode=mode, storage_options=storage_options, @@ -76,13 +77,17 @@ def __init__( self.mode = ioargs.mode # GH21227 internal compression is not used for non-binary handles. - if compression and hasattr(self.path_or_buf, "write") and "b" not in self.mode: + if ( + self.compression + and hasattr(self.path_or_buf, "write") + and "b" not in self.mode + ): warnings.warn( "compression has no effect when passing a non-binary object as input.", RuntimeWarning, stacklevel=2, ) - compression = None + self.compression = None self.sep = sep self.na_rep = na_rep @@ -91,7 +96,6 @@ def __init__( self.header = header self.index = index self.index_label = index_label - self.encoding = encoding or "utf-8" self.errors = errors self.quoting = quoting or csvlib.QUOTE_MINIMAL self.quotechar = quotechar From 5dcff8e58af99e08ac081e7a35dabfffc34187e7 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 02:54:03 +0700 Subject: [PATCH 19/37] REF: eliminate preallocation of self.data --- pandas/io/formats/csvs.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index d5e50f8b40f02..096fe22e3e2e4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -106,10 +106,6 @@ def __init__( self.cols = cols self.chunksize = chunksize - # preallocate data 2d list - ncols = self.obj.shape[-1] - self.data 
= [None] * ncols - @property def index_label(self): return self._index_label @@ -345,6 +341,9 @@ def _save_body(self) -> None: self._save_chunk(start_i, end_i) def _save_chunk(self, start_i: int, end_i: int) -> None: + ncols = self.obj.shape[-1] + data = [None] * ncols + # create the data for a chunk slicer = slice(start_i, end_i) @@ -360,8 +359,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: ) for col_loc, col in zip(block.mgr_locs, d): - # self.data is a preallocated list - self.data[col_loc] = col + data[col_loc] = col ix = self.data_index.to_native_types( slicer=slicer, @@ -372,4 +370,4 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: quoting=self.quoting, ) - libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) + libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) From ff144d87bcf3e246e4514422d4684f785123f606 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 03:23:14 +0700 Subject: [PATCH 20/37] REF: extract method _convert_to_native_types --- pandas/io/formats/csvs.py | 42 +++++++++++++-------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 096fe22e3e2e4..744339b77c167 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -162,12 +162,7 @@ def cols(self, cols): if cols is not None: if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types( - na_rep=self.na_rep, - float_format=self.float_format, - date_format=self.date_format, - quoting=self.quoting, - ) + cols = self._convert_to_native_types(cols) else: cols = list(cols) self.obj = self.obj.loc[:, cols] @@ -176,17 +171,22 @@ def cols(self, cols): # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types( - na_rep=self.na_rep, - float_format=self.float_format, - date_format=self.date_format, - quoting=self.quoting, - ) + 
cols = self._convert_to_native_types(cols) else: cols = list(cols) self._cols = cols + def _convert_to_native_types(self, arg, **kwargs): + return arg.to_native_types( + na_rep=self.na_rep, + float_format=self.float_format, + date_format=self.date_format, + quoting=self.quoting, + decimal=self.decimal, + **kwargs, + ) + @property def chunksize(self): return self._chunksize @@ -350,24 +350,10 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: df = self.obj.iloc[slicer] for block in df._mgr.blocks: - d = block.to_native_types( - na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting, - ) + d = self._convert_to_native_types(block) for col_loc, col in zip(block.mgr_locs, d): data[col_loc] = col - ix = self.data_index.to_native_types( - slicer=slicer, - na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting, - ) - + ix = self._convert_to_native_types(self.data_index, slicer=slicer) libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) From 3da7207bb77ff035bfbfa2c80b99365af0fff80d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 16:13:39 +0700 Subject: [PATCH 21/37] REF: rename regular -> flat as reviewed --- pandas/io/formats/csvs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 744339b77c167..66f9843562f6c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -125,12 +125,12 @@ def _get_index_label_from_obj(self) -> List[str]: if isinstance(self.obj.index, ABCMultiIndex): return self._get_index_label_multiindex() else: - return self._get_index_label_regular() + return self._get_index_label_flat() def _get_index_label_multiindex(self) -> List[str]: return [name or "" for name in self.obj.index.names] - def _get_index_label_regular(self) -> List[str]: + def 
_get_index_label_flat(self) -> List[str]: index_label = self.obj.index.name return [""] if index_label is None else [index_label] From 6041666d0e9afa3988d22bb52609be971594b04d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 16:27:13 +0700 Subject: [PATCH 22/37] TYP: add type annotations as reviewed --- pandas/io/formats/csvs.py | 42 +++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 66f9843562f6c..f57d29621088e 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,7 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Hashable, List, Optional, Sequence, Union +from typing import Generator, Hashable, List, Optional, Sequence, Union import warnings import numpy as np @@ -21,6 +21,8 @@ ) from pandas.core.dtypes.missing import notna +from pandas.core.indexes.api import Index + from pandas.io.common import ( get_compression_method, get_filepath_or_buffer, @@ -28,6 +30,8 @@ infer_compression, ) +IndexLabel = Optional[Union[bool, str, Sequence[Hashable]]] + class CSVFormatter: def __init__( @@ -40,7 +44,7 @@ def __init__( cols=None, header: Union[bool, Sequence[Hashable]] = True, index: bool = True, - index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, + index_label: IndexLabel = None, mode: str = "w", encoding: Optional[str] = None, errors: str = "strict", @@ -107,30 +111,32 @@ def __init__( self.chunksize = chunksize @property - def index_label(self): + def index_label(self) -> IndexLabel: return self._index_label @index_label.setter - def index_label(self, index_label) -> None: - # should write something for index label - if index_label is not False: - if index_label is None: - index_label = self._get_index_label_from_obj() - elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): - # given a string for a DF with Index - index_label = 
[index_label] + def index_label(self, index_label: IndexLabel) -> None: + if index_label is False: + self._index_label = index_label + return + + if index_label is None: + index_label = self._get_index_label_from_obj() + elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): + # given a string for a DF with Index + index_label = [index_label] self._index_label = index_label - def _get_index_label_from_obj(self) -> List[str]: + def _get_index_label_from_obj(self): if isinstance(self.obj.index, ABCMultiIndex): return self._get_index_label_multiindex() else: return self._get_index_label_flat() - def _get_index_label_multiindex(self) -> List[str]: + def _get_index_label_multiindex(self): return [name or "" for name in self.obj.index.names] - def _get_index_label_flat(self) -> List[str]: + def _get_index_label_flat(self): index_label = self.obj.index.name return [""] if index_label is None else [index_label] @@ -198,14 +204,12 @@ def chunksize(self, chunksize): self._chunksize = int(chunksize) @property - def data_index(self): + def data_index(self) -> Index: data_index = self.obj.index if ( isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and self.date_format is not None ): - from pandas import Index - data_index = Index( [x.strftime(self.date_format) if notna(x) else "" for x in data_index] ) @@ -239,7 +243,7 @@ def write_cols(self): return self.cols @property - def encoded_labels(self) -> List[str]: + def encoded_labels(self): encoded_labels: List[str] = [] if self.index and self.index_label: @@ -309,7 +313,7 @@ def _save_header(self) -> None: for row in self._generate_multiindex_header_rows(): self.writer.writerow(row) - def _generate_multiindex_header_rows(self): + def _generate_multiindex_header_rows(self) -> Generator[List[str], None, None]: columns = self.obj.columns for i in range(columns.nlevels): # we need at least 1 index column to write our col names From 46f593d87f04ca6ff5d9edd0f949df437b586c25 Mon Sep 17 00:00:00 2001 From: 
Maxim Ivanov Date: Sat, 5 Sep 2020 16:39:19 +0700 Subject: [PATCH 23/37] REF: refactor number formatting Replace the _convert_to_native_types method with a number formatting dictionary. --- pandas/io/formats/csvs.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index f57d29621088e..a09ecb2d1974c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -168,7 +168,7 @@ def cols(self, cols): if cols is not None: if isinstance(cols, ABCIndexClass): - cols = self._convert_to_native_types(cols) + cols = cols.to_native_types(**self._number_format) else: cols = list(cols) self.obj = self.obj.loc[:, cols] @@ -177,20 +177,22 @@ def cols(self, cols): # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols, ABCIndexClass): - cols = self._convert_to_native_types(cols) + cols = cols.to_native_types(**self._number_format) + else: cols = list(cols) self._cols = cols - def _convert_to_native_types(self, arg, **kwargs): - return arg.to_native_types( + @property + def _number_format(self) -> dict: + """Dictionary used for storing number formatting settings.""" + return dict( na_rep=self.na_rep, float_format=self.float_format, date_format=self.date_format, quoting=self.quoting, decimal=self.decimal, - **kwargs, ) @property @@ -354,10 +356,10 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: df = self.obj.iloc[slicer] for block in df._mgr.blocks: - d = self._convert_to_native_types(block) + d = block.to_native_types(**self._number_format) for col_loc, col in zip(block.mgr_locs, d): data[col_loc] = col - ix = self._convert_to_native_types(self.data_index, slicer=slicer) + ix = self.data_index.to_native_types(slicer=slicer, **self._number_format) libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) From 080e6e1a03244fd344d30e45c386ea098f7a0a95 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020
19:30:13 +0700 Subject: [PATCH 24/37] FIX: mypy error with index_label --- pandas/io/formats/csvs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index a09ecb2d1974c..ac31841d5aa88 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -11,7 +11,7 @@ import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -30,7 +30,7 @@ infer_compression, ) -IndexLabel = Optional[Union[bool, str, Sequence[Hashable]]] +IndexLabel = Optional[Union[bool, str, Sequence[Label]]] class CSVFormatter: From 1e35f87c4a916d6cd8b71d1deb2bfeed30f24bf7 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 19:58:17 +0700 Subject: [PATCH 25/37] FIX: reorder if-statements in index_label To make sure that the newer mypy (v0.782) passes. 
--- pandas/io/formats/csvs.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index ac31841d5aa88..c6e80d4f46169 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -116,15 +116,12 @@ def index_label(self) -> IndexLabel: @index_label.setter def index_label(self, index_label: IndexLabel) -> None: - if index_label is False: - self._index_label = index_label - return - - if index_label is None: - index_label = self._get_index_label_from_obj() - elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): - # given a string for a DF with Index - index_label = [index_label] + if index_label is not False: + if index_label is None: + index_label = self._get_index_label_from_obj() + elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): + # given a string for a DF with Index + index_label = [index_label] self._index_label = index_label def _get_index_label_from_obj(self): From ba353a56fe6eac817200d89271492c42979c6cb9 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 20:09:39 +0700 Subject: [PATCH 26/37] TYP: move IndexLabel to pandas._typing This eliminates repetition of the type annotations for index label in multiple places. 
--- pandas/_typing.py | 2 ++ pandas/core/generic.py | 3 ++- pandas/io/formats/csvs.py | 9 ++++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index f8af92e07c674..468edcfbd4181 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -15,6 +15,7 @@ List, Mapping, Optional, + Sequence, Type, TypeVar, Union, @@ -82,6 +83,7 @@ Axis = Union[str, int] Label = Optional[Hashable] +IndexLabel = Optional[Union[bool, str, Sequence[Label]]] Level = Union[Label, int] Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c8780a0fc186..078d489991ff6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -40,6 +40,7 @@ CompressionOptions, FilePathOrBuffer, FrameOrSeries, + IndexLabel, JSONSerializable, Label, Level, @@ -3160,7 +3161,7 @@ def to_csv( columns: Optional[Sequence[Label]] = None, header: Union[bool_t, List[str]] = True, index: bool_t = True, - index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, + index_label: IndexLabel = None, mode: str = "w", encoding: Optional[str] = None, compression: CompressionOptions = "infer", diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c6e80d4f46169..64a612e881548 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -11,7 +11,12 @@ import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + IndexLabel, + StorageOptions, +) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -30,8 +35,6 @@ infer_compression, ) -IndexLabel = Optional[Union[bool, str, Sequence[Label]]] - class CSVFormatter: def __init__( From a49dd63b7a1abd9e7c02860f54285c839c6ea17c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 21:56:37 +0700 Subject: 
[PATCH 27/37] TYP: quotechar, has_mi_columns, _need_to_save... --- pandas/io/formats/csvs.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 64a612e881548..a062a378a4b04 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -15,6 +15,7 @@ CompressionOptions, FilePathOrBuffer, IndexLabel, + Label, StorageOptions, ) @@ -44,7 +45,7 @@ def __init__( sep: str = ",", na_rep: str = "", float_format: Optional[str] = None, - cols=None, + cols: Optional[Sequence[Label]] = None, header: Union[bool, Sequence[Hashable]] = True, index: bool = True, index_label: IndexLabel = None, @@ -55,7 +56,7 @@ def __init__( quoting: Optional[int] = None, line_terminator="\n", chunksize: Optional[int] = None, - quotechar='"', + quotechar: Optional[str] = '"', date_format: Optional[str] = None, doublequote: bool = True, escapechar: Optional[str] = None, @@ -141,18 +142,19 @@ def _get_index_label_flat(self): return [""] if index_label is None else [index_label] @property - def quotechar(self): + def quotechar(self) -> Optional[str]: if self.quoting != csvlib.QUOTE_NONE: # prevents crash in _csv return self._quotechar + return None @quotechar.setter - def quotechar(self, quotechar): + def quotechar(self, quotechar: Optional[str]) -> None: self._quotechar = quotechar @property - def has_mi_columns(self): - return isinstance(self.obj.columns, ABCMultiIndex) + def has_mi_columns(self) -> bool: + return bool(isinstance(self.obj.columns, ABCMultiIndex)) @property def cols(self): @@ -178,7 +180,6 @@ def cols(self, cols): cols = self.obj.columns if isinstance(cols, ABCIndexClass): cols = cols.to_native_types(**self._number_format) - else: cols = list(cols) @@ -229,8 +230,8 @@ def _has_aliases(self): return isinstance(self.header, (tuple, list, np.ndarray, ABCIndexClass)) @property - def _need_to_save_header(self): - return self._has_aliases or self.header + def 
_need_to_save_header(self) -> bool: + return bool(self._has_aliases or self.header) @property def write_cols(self): From f1e1ac86cb33944f3938dbc5d59ea5ce13beecea Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 22:02:05 +0700 Subject: [PATCH 28/37] TYP: chunksize, but ignored assignment check For some reason mypy would not recognize that chunksize turns from Optional[int] to int inside the setter. Even setting an intentional assertion ``assert chunksize is not None`` does not help. --- pandas/io/formats/csvs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index a062a378a4b04..14df5bb40dcc1 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -112,7 +112,7 @@ def __init__( self.line_terminator = line_terminator or os.linesep self.date_format = date_format self.cols = cols - self.chunksize = chunksize + self.chunksize = chunksize # type: ignore[assignment] @property def index_label(self) -> IndexLabel: @@ -197,13 +197,14 @@ def _number_format(self) -> dict: ) @property - def chunksize(self): + def chunksize(self) -> int: return self._chunksize @chunksize.setter - def chunksize(self, chunksize): + def chunksize(self, chunksize: Optional[int]) -> None: if chunksize is None: chunksize = (100000 // (len(self.cols) or 1)) or 1 + assert chunksize is not None self._chunksize = int(chunksize) @property From 134699566995b012ed75c5ccebc8abb2bbf9047b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 22:16:02 +0700 Subject: [PATCH 29/37] TYP: cols property Limitations: - ignore type[assignment] error. - Created additional method _refine_cols to allow conversion from Optional[Sequence[Label]] to Sequence[Label]. 
--- pandas/io/formats/csvs.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 14df5bb40dcc1..b30e555c8812f 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -111,7 +111,7 @@ def __init__( self.escapechar = escapechar self.line_terminator = line_terminator or os.linesep self.date_format = date_format - self.cols = cols + self.cols = cols # type: ignore[assignment] self.chunksize = chunksize # type: ignore[assignment] @property @@ -157,11 +157,14 @@ def has_mi_columns(self) -> bool: return bool(isinstance(self.obj.columns, ABCMultiIndex)) @property - def cols(self): + def cols(self) -> Sequence[Label]: return self._cols @cols.setter - def cols(self, cols): + def cols(self, cols: Optional[Sequence[Label]]) -> None: + self._cols = self._refine_cols(cols) + + def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # validate mi options if self.has_mi_columns: if cols is not None: @@ -179,11 +182,10 @@ def cols(self, cols): # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types(**self._number_format) + return cols.to_native_types(**self._number_format) else: - cols = list(cols) - - self._cols = cols + assert isinstance(cols, Sequence) + return list(cols) @property def _number_format(self) -> dict: From bebdfcf1aa8e76a7cd69ce111fcdb96921530860 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 5 Sep 2020 22:32:59 +0700 Subject: [PATCH 30/37] TYP: nlevels and _has_aliases --- pandas/io/formats/csvs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index b30e555c8812f..2373c3f7785e2 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -222,14 +222,14 @@ def data_index(self) -> Index: return data_index @property - def nlevels(self): + def nlevels(self) 
-> int: if self.index: return getattr(self.data_index, "nlevels", 1) else: return 0 @property - def _has_aliases(self): + def _has_aliases(self) -> bool: return isinstance(self.header, (tuple, list, np.ndarray, ABCIndexClass)) @property From ca888c1064ada48def5608b76df5d713c067b2c5 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 6 Sep 2020 00:21:16 +0700 Subject: [PATCH 31/37] CLN: move GH21227 check to pandas/io/common.py --- pandas/io/common.py | 15 +++++++++++++++ pandas/io/formats/csvs.py | 16 ++-------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index a80b89569f429..007e0dcbbcfe1 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -208,6 +208,21 @@ def get_filepath_or_buffer( # handle compression dict compression_method, compression = get_compression_method(compression) compression_method = infer_compression(filepath_or_buffer, compression_method) + + # GH21227 internal compression is not used for non-binary handles. 
+ if ( + compression_method + and hasattr(filepath_or_buffer, "write") + and mode + and "b" not in mode + ): + warnings.warn( + "compression has no effect when passing a non-binary object as input.", + RuntimeWarning, + stacklevel=2, + ) + compression_method = None + compression = dict(compression, method=compression_method) # bz2 and xz do not write the byte order mark for utf-16 and utf-32 diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c35caf5bba1e3..499f73c27406f 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -6,7 +6,6 @@ from io import StringIO, TextIOWrapper import os from typing import Generator, Hashable, List, Optional, Sequence, Union -import warnings import numpy as np @@ -59,6 +58,7 @@ def __init__( storage_options: StorageOptions = None, ): self.obj = obj + self.encoding = encoding or "utf-8" if path_or_buf is None: @@ -71,25 +71,13 @@ def __init__( mode=mode, storage_options=storage_options, ) + self.compression = ioargs.compression.pop("method") self.compression_args = ioargs.compression self.path_or_buf = ioargs.filepath_or_buffer self.should_close = ioargs.should_close self.mode = ioargs.mode - # GH21227 internal compression is not used for non-binary handles. 
- if ( - self.compression - and hasattr(self.path_or_buf, "write") - and "b" not in self.mode - ): - warnings.warn( - "compression has no effect when passing a non-binary object as input.", - RuntimeWarning, - stacklevel=2, - ) - self.compression = None - self.sep = sep self.na_rep = na_rep self.float_format = float_format From b7dae1124b466fd0eece6c94c5af1e74a4958d4e Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 6 Sep 2020 01:58:40 +0700 Subject: [PATCH 32/37] TYP: remove redundant bool from IndexLabel type --- pandas/_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 0f46d05afab08..7aef5c02e290f 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -83,7 +83,7 @@ Axis = Union[str, int] Label = Optional[Hashable] -IndexLabel = Optional[Union[bool, str, Sequence[Label]]] +IndexLabel = Optional[Union[Label, Sequence[Label]]] Level = Union[Label, int] Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] From 1f8c488151249ab70945828394d03892903f922e Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 6 Sep 2020 01:59:25 +0700 Subject: [PATCH 33/37] TYP: add to _get_index_label... 
methods --- pandas/io/formats/csvs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 499f73c27406f..b43727464073e 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -109,16 +109,16 @@ def index_label(self, index_label: IndexLabel) -> None: index_label = [index_label] self._index_label = index_label - def _get_index_label_from_obj(self): + def _get_index_label_from_obj(self) -> Sequence[str]: if isinstance(self.obj.index, ABCMultiIndex): return self._get_index_label_multiindex() else: return self._get_index_label_flat() - def _get_index_label_multiindex(self): + def _get_index_label_multiindex(self) -> Sequence[str]: return [name or "" for name in self.obj.index.names] - def _get_index_label_flat(self): + def _get_index_label_flat(self) -> Sequence[str]: index_label = self.obj.index.name return [""] if index_label is None else [index_label] From 1a750b4ffb79d8dd6d6b10d500a678fdc8275811 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sun, 6 Sep 2020 02:00:53 +0700 Subject: [PATCH 34/37] TYP: use Iterator instead of Generator --- pandas/io/formats/csvs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index b43727464073e..e631ab2cb2301 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,7 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Generator, Hashable, List, Optional, Sequence, Union +from typing import Hashable, Iterator, List, Optional, Sequence, Union import numpy as np @@ -300,7 +300,7 @@ def _save_header(self) -> None: for row in self._generate_multiindex_header_rows(): self.writer.writerow(row) - def _generate_multiindex_header_rows(self) -> Generator[List[str], None, None]: + def _generate_multiindex_header_rows(self) -> Iterator[List[str]]: columns = self.obj.columns for i in range(columns.nlevels): 
# we need at least 1 index column to write our col names From 7b8992178be513dd5716033737c98381598a2a4f Mon Sep 17 00:00:00 2001 From: ivanovmg <41443370+ivanovmg@users.noreply.github.com> Date: Sun, 6 Sep 2020 02:28:35 +0700 Subject: [PATCH 35/37] TYP: explicitly use List type --- pandas/io/formats/csvs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e631ab2cb2301..f2dc1a1e7a811 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -109,16 +109,16 @@ def index_label(self, index_label: IndexLabel) -> None: index_label = [index_label] self._index_label = index_label - def _get_index_label_from_obj(self) -> Sequence[str]: + def _get_index_label_from_obj(self) -> List[str]: if isinstance(self.obj.index, ABCMultiIndex): return self._get_index_label_multiindex() else: return self._get_index_label_flat() - def _get_index_label_multiindex(self) -> Sequence[str]: + def _get_index_label_multiindex(self) -> List[str]: return [name or "" for name in self.obj.index.names] - def _get_index_label_flat(self) -> Sequence[str]: + def _get_index_label_flat(self) -> List[str]: index_label = self.obj.index.name return [""] if index_label is None else [index_label] From 247808404f44332f1d4938e1dfc1de8bbc17cda4 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 7 Sep 2020 01:23:17 +0700 Subject: [PATCH 36/37] TYP: correct dict typing --- pandas/io/formats/csvs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index f2dc1a1e7a811..cd1877389fb04 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,7 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Hashable, Iterator, List, Optional, Sequence, Union +from typing import Any, Dict, Hashable, Iterator, List, Optional, Sequence, Union import numpy as np @@ -169,7 +169,7 @@ def 
_refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: return list(cols) @property - def _number_format(self) -> dict: + def _number_format(self) -> Dict[str, Any]: """Dictionary used for storing number formatting settings.""" return dict( na_rep=self.na_rep, From e08f6564070807c1608911a6343104def57e5393 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 7 Sep 2020 01:23:53 +0700 Subject: [PATCH 37/37] TYP: remaining properties --- pandas/io/formats/csvs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index cd1877389fb04..90ab6f61f4d74 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -218,8 +218,9 @@ def _need_to_save_header(self) -> bool: return bool(self._has_aliases or self.header) @property - def write_cols(self): + def write_cols(self) -> Sequence[Label]: if self._has_aliases: + assert not isinstance(self.header, bool) if len(self.header) != len(self.cols): raise ValueError( f"Writing {len(self.cols)} cols but got {len(self.header)} aliases" @@ -230,10 +231,11 @@ def write_cols(self): return self.cols @property - def encoded_labels(self): - encoded_labels: List[str] = [] + def encoded_labels(self) -> List[Label]: + encoded_labels: List[Label] = [] if self.index and self.index_label: + assert isinstance(self.index_label, Sequence) encoded_labels = list(self.index_label) if not self.has_mi_columns or self._has_aliases: @@ -300,7 +302,7 @@ def _save_header(self) -> None: for row in self._generate_multiindex_header_rows(): self.writer.writerow(row) - def _generate_multiindex_header_rows(self) -> Iterator[List[str]]: + def _generate_multiindex_header_rows(self) -> Iterator[List[Label]]: columns = self.obj.columns for i in range(columns.nlevels): # we need at least 1 index column to write our col names