""" parquet compat """

from warnings import catch_warnings
from distutils.version import LooseVersion
from pandas import DataFrame, RangeIndex, Int64Index, get_option
from pandas.compat import range
from pandas.io.common import get_filepath_or_buffer


def get_engine(engine):
    """ return our implementation """

    if engine is None:
        engine = get_option('io.parquet.engine')

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()

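
# Usage sketch for ``get_engine`` (illustrative only, not part of the
# module): both returned implementations expose the same write/read
# interface. Assumes at least one engine library is installed and that
# ``df`` and ``path`` are a caller-supplied DataFrame and file path.
#
#   impl = get_engine(None)           # resolves via get_option('io.parquet.engine')
#   impl = get_engine('fastparquet')  # explicit engine choice
#   impl.write(df, path)              # same interface for either engine
#   df = impl.read(path)
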

class PyArrowImpl(object):

    def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use

        try:
            import pyarrow
            import pyarrow.parquet
        except ImportError:
            raise ImportError("pyarrow is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install pyarrow -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install pyarrow\n")

        if LooseVersion(pyarrow.__version__) < '0.4.1':
            raise ImportError("pyarrow >= 0.4.1 is required for parquet "
                              "support\n\n"
                              "you can install via conda\n"
                              "conda install pyarrow -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install pyarrow\n")

        self.api = pyarrow

    def write(self, df, path, compression='snappy', **kwargs):
        path, _, _ = get_filepath_or_buffer(path)
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)

    def read(self, path):
        path, _, _ = get_filepath_or_buffer(path)
        return self.api.parquet.read_table(path).to_pandas()

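
# Sketch of the raw pyarrow calls that ``PyArrowImpl`` wraps above
# (illustrative; 'example.parquet' is a hypothetical path, and
# pyarrow >= 0.4.1 is assumed to be installed):
#
#   import pyarrow
#   import pyarrow.parquet
#   table = pyarrow.Table.from_pandas(df)
#   pyarrow.parquet.write_table(table, 'example.parquet',
#                               compression='snappy')
#   df2 = pyarrow.parquet.read_table('example.parquet').to_pandas()
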

class FastParquetImpl(object):

    def __init__(self):
        # since pandas is a dependency of fastparquet
        # we need to import on first use

        try:
            import fastparquet
        except ImportError:
            raise ImportError("fastparquet is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install fastparquet -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install fastparquet")

        if LooseVersion(fastparquet.__version__) < '0.1.0':
            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
                              "support\n\n"
                              "you can install via conda\n"
                              "conda install fastparquet -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install fastparquet")

        self.api = fastparquet

    def write(self, df, path, compression='snappy', **kwargs):
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.
        path, _, _ = get_filepath_or_buffer(path)
        with catch_warnings(record=True):
            self.api.write(path, df,
                           compression=compression, **kwargs)

    def read(self, path):
        path, _, _ = get_filepath_or_buffer(path)
        return self.api.ParquetFile(path).to_pandas()

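
# Sketch of the raw fastparquet calls that ``FastParquetImpl`` wraps
# above (illustrative; 'example.parquet' is a hypothetical path, and
# fastparquet >= 0.1.0 is assumed to be installed):
#
#   import fastparquet
#   fastparquet.write('example.parquet', df, compression='snappy')
#   df2 = fastparquet.ParquetFile('example.parquet').to_pandas()
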

def to_parquet(df, path, engine=None, compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : str, optional
        The parquet engine, one of {'pyarrow', 'fastparquet'}
        if None, will use the option: `io.parquet.engine`
    compression : str, optional, default 'snappy'
        compression method, includes {'gzip', 'snappy', 'brotli'}
    kwargs are passed to the engine
    """

    impl = get_engine(engine)

    if not isinstance(df, DataFrame):
        raise ValueError("to_parquet only supports IO with DataFrames")

    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        raise ValueError("parquet does not support serializing {} "
                         "for the index; you can .reset_index() "
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("parquet does not support serializing a "
                         "non-default index; you can .reset_index() "
                         "to make the index into column(s)")

    if df.index.name is not None:
        raise ValueError("parquet does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have valid column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("parquet must have string column names")

    return impl.write(df, path, compression=compression, **kwargs)

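
# Illustrative note on the index restriction enforced above (hypothetical
# DataFrame and path, mirroring the error messages in to_parquet):
#
#   df = df.set_index('timestamp')
#   to_parquet(df, 'out.parquet')                # raises ValueError
#   to_parquet(df.reset_index(), 'out.parquet')  # index becomes a column
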

def read_parquet(path, engine=None, **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    engine : str, optional
        The parquet engine, one of {'pyarrow', 'fastparquet'}
        if None, will use the option: `io.parquet.engine`
    kwargs are passed to the engine

    Returns
    -------
    DataFrame

    """

    impl = get_engine(engine)
    return impl.read(path)
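

if __name__ == '__main__':
    # Minimal round-trip demo (illustrative, not part of pandas); assumes
    # at least one parquet engine is installed. Writes into a fresh
    # temporary directory so no existing file is clobbered.
    import os
    import tempfile
    import pandas as pd

    demo = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    demo_path = os.path.join(tempfile.mkdtemp(), 'demo.parquet')
    to_parquet(demo, demo_path)
    print(read_parquet(demo_path))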