|
9 | 9 | from io import StringIO
|
10 | 10 |
|
11 | 11 | from pandas.core.common import adjoin, isnull, notnull
|
12 |
| -from pandas.core.index import MultiIndex, _ensure_index |
| 12 | +from pandas.core.index import Index, MultiIndex, _ensure_index |
13 | 13 | from pandas.util import py3compat
|
14 | 14 | from pandas.core.config import get_option, set_option, reset_option
|
15 | 15 | import pandas.core.common as com
|
|
18 | 18 | import numpy as np
|
19 | 19 |
|
20 | 20 | import itertools
|
| 21 | +import csv |
21 | 22 |
|
22 | 23 | from pandas.tseries.period import PeriodIndex
|
23 | 24 |
|
@@ -763,6 +764,260 @@ def grouper(x):
|
763 | 764 | return result
|
764 | 765 |
|
765 | 766 |
|
class CSVFormatter(object):
    """
    Serialize a DataFrame (``obj``) to CSV, writing either to a file path or
    to an already-open buffer.

    Parameters mirror ``DataFrame.to_csv``.  ``legacy=True`` routes through
    the older row-at-a-time writer (``_helper_csv``) and is scheduled for
    removal in 0.12; the default path pre-renders whole column blocks to
    native types and emits them in chunks via ``lib.write_csv_rows`` for
    speed.  ``nanRep`` is accepted for backward compatibility but unused.
    """

    def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
                 cols=None, header=True, index=True, index_label=None,
                 mode='w', nanRep=None, encoding=None, quoting=None,
                 line_terminator='\n', chunksize=None, legacy=False):
        self.legacy = legacy  # remove for 0.12
        self.obj = obj
        self.path_or_buf = path_or_buf
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        self.encoding = encoding

        if quoting is None:
            quoting = csv.QUOTE_MINIMAL
        self.quoting = quoting

        self.line_terminator = line_terminator

        if cols is None:
            cols = obj.columns

        if isinstance(cols, Index):
            # pre-render column labels through the index formatter so
            # NaN / float labels honor na_rep and float_format
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format)
        else:
            cols = list(cols)
        self.cols = cols

        # preallocate the 2d data buffer: one slot per column, filled
        # block-by-block in _save_chunk
        self.blocks = self.obj._data.blocks
        ncols = sum(len(b.items) for b in self.blocks)
        self.data = [None] * ncols

        # fail early if we have duplicate columns
        if len(set(self.cols)) != len(self.cols):
            raise ValueError("duplicate columns are not permitted in to_csv")

        # column name -> position in self.data
        self.colname_map = dict((k, i) for i, k in enumerate(obj.columns))

        if chunksize is None:
            # target roughly 100k cells per chunk; floor division keeps
            # chunksize an int even under true division
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = chunksize

        self.data_index = obj.index
        if isinstance(obj.index, PeriodIndex):
            # PeriodIndex has no native-type writer; render as timestamps
            self.data_index = obj.index.to_timestamp()

        # number of index columns to emit (0 when index writing is off)
        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0

    # legacy to be removed in 0.12
    def _helper_csv(self, writer, na_rep=None, cols=None,
                    header=True, index=True,
                    index_label=None, float_format=None):
        """Row-at-a-time CSV writer (pre-0.11 code path)."""
        if cols is None:
            # BUG FIX: was ``self.columns`` (leftover from when this lived
            # on DataFrame) which does not exist on CSVFormatter
            cols = self.cols

        # materialize each column's ndarray once, up front
        series = {}
        for k, v in self.obj._series.iteritems():
            series[k] = v.values

        has_aliases = isinstance(header, (tuple, list, np.ndarray))
        if has_aliases or header:
            if index:
                # should write something for index label
                if index_label is not False:
                    if index_label is None:
                        # derive labels from the index's own name(s)
                        if isinstance(self.obj.index, MultiIndex):
                            index_label = []
                            for i, name in enumerate(self.obj.index.names):
                                if name is None:
                                    name = ''
                                index_label.append(name)
                        else:
                            index_label = self.obj.index.name
                            if index_label is None:
                                index_label = ['']
                            else:
                                index_label = [index_label]
                    elif not isinstance(index_label,
                                        (list, tuple, np.ndarray)):
                        # given a string for a DF with Index
                        index_label = [index_label]

                    encoded_labels = list(index_label)
                else:
                    encoded_labels = []

            if has_aliases:
                # header is a sequence of replacement column names
                if len(header) != len(cols):
                    raise ValueError(('Writing %d cols but got %d aliases'
                                      % (len(cols), len(header))))
                else:
                    write_cols = header
            else:
                write_cols = cols
            encoded_cols = list(write_cols)

            writer.writerow(encoded_labels + encoded_cols)
        else:
            encoded_cols = list(cols)
            writer.writerow(encoded_cols)

        data_index = self.obj.index
        if isinstance(self.obj.index, PeriodIndex):
            data_index = self.obj.index.to_timestamp()

        nlevels = getattr(data_index, 'nlevels', 1)
        for j, idx in enumerate(data_index):
            row_fields = []
            if index:
                if nlevels == 1:
                    row_fields = [idx]
                else:  # handle MultiIndex
                    row_fields = list(idx)
            for i, col in enumerate(cols):
                val = series[col][j]
                if lib.checknull(val):
                    val = na_rep

                if float_format is not None and com.is_float(val):
                    val = float_format % val
                elif isinstance(val, np.datetime64):
                    # render datetimes without the dtype repr noise
                    val = lib.Timestamp(val)._repr_base

                row_fields.append(val)

            writer.writerow(row_fields)

    def save(self):
        """Open the output (if needed), build the csv writer and dispatch
        to the legacy or chunked writing path, closing any handle we
        opened ourselves."""
        # NOTE(review): probing for 'read' to detect an open buffer looks
        # odd for an output target ('write' seems intended) -- kept as-is,
        # confirm against callers before changing
        if hasattr(self.path_or_buf, 'read'):
            f = self.path_or_buf
            close = False
        else:
            f = com._get_handle(self.path_or_buf, self.mode,
                                encoding=self.encoding)
            close = True

        try:
            if self.encoding is not None:
                # UnicodeWriter handles encoding on py2
                self.writer = com.UnicodeWriter(
                    f, lineterminator=self.line_terminator,
                    delimiter=self.sep, encoding=self.encoding,
                    quoting=self.quoting)
            else:
                self.writer = csv.writer(
                    f, lineterminator=self.line_terminator,
                    delimiter=self.sep, quoting=self.quoting)

            if self.legacy:
                # to be removed in 0.12
                self._helper_csv(self.writer, na_rep=self.na_rep,
                                 float_format=self.float_format,
                                 cols=self.cols, header=self.header,
                                 index=self.index,
                                 index_label=self.index_label)
            else:
                self._save()
        finally:
            if close:
                f.close()

    def _save_header(self):
        """Write the header row (index label columns + column names),
        honoring header aliases and index_label overrides."""
        writer = self.writer
        obj = self.obj
        index_label = self.index_label
        cols = self.cols
        header = self.header

        has_aliases = isinstance(header, (tuple, list, np.ndarray))
        if has_aliases or self.header:
            if self.index:
                # should write something for index label
                if index_label is not False:
                    if index_label is None:
                        if isinstance(obj.index, MultiIndex):
                            index_label = []
                            for i, name in enumerate(obj.index.names):
                                if name is None:
                                    name = ''
                                index_label.append(name)
                        else:
                            index_label = obj.index.name
                            if index_label is None:
                                index_label = ['']
                            else:
                                index_label = [index_label]
                    elif not isinstance(index_label,
                                        (list, tuple, np.ndarray)):
                        # given a string for a DF with Index
                        index_label = [index_label]

                    encoded_labels = list(index_label)
                else:
                    encoded_labels = []

            if has_aliases:
                if len(header) != len(cols):
                    raise ValueError(('Writing %d cols but got %d aliases'
                                      % (len(cols), len(header))))
                else:
                    write_cols = header
            else:
                write_cols = cols
            encoded_cols = list(write_cols)

            writer.writerow(encoded_labels + encoded_cols)
        else:
            encoded_cols = list(cols)
            writer.writerow(encoded_cols)

    def _save(self):
        """Write header then body in chunksize-row bites."""
        self._save_header()

        nrows = len(self.data_index)

        # write in chunksize bites
        chunksize = self.chunksize
        chunks = int(nrows / chunksize) + 1

        for i in xrange(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i >= end_i:
                break

            self._save_chunk(start_i, end_i)

    def _save_chunk(self, start_i, end_i):
        """Render rows [start_i, end_i) to native (string) types block by
        block and hand them to the C row writer."""
        colname_map = self.colname_map
        data_index = self.data_index

        # create the data for a chunk
        slicer = slice(start_i, end_i)
        for i in range(len(self.blocks)):
            b = self.blocks[i]
            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                  float_format=self.float_format)
            for j, k in enumerate(b.items):
                # self.data is a preallocated list; place each column at
                # its original DataFrame position
                self.data[colname_map[k]] = d[j]

        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                        float_format=self.float_format)

        lib.write_csv_rows(self.data, ix, self.nlevels, self.cols,
                           self.writer)
| 1020 | + |
766 | 1021 | # from collections import namedtuple
|
767 | 1022 | # ExcelCell = namedtuple("ExcelCell",
|
768 | 1023 | # 'row, col, val, style, mergestart, mergeend')
|
|
0 commit comments