5
5
from pandas .compat import range
6
6
7
7
8
- def _try_import_pyarrow ():
9
- # since pandas is a dependency of pyarrow
10
- # we need to import on first use
8
def get_engine(engine):
    """Return the parquet engine implementation for *engine*.

    Parameters
    ----------
    engine : str
        one of {'pyarrow', 'fastparquet'}

    Raises
    ------
    ValueError
        if *engine* is not a supported engine name
    """

    if engine == 'pyarrow':
        return PyArrowImpl()
    if engine == 'fastparquet':
        return FastParquetImpl()

    raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
18
+
19
+
20
class PyArrowImpl(object):
    """parquet implementation backed by the pyarrow library"""

    def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use
        try:
            import pyarrow  # noqa
        except ImportError:
            raise ImportError("pyarrow is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install pyarrow -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install pyarrow\n")

    def write(self, df, path, compression=None, **kwargs):
        """Write *df* to *path* as a parquet file.

        Parameters
        ----------
        df : DataFrame
        path : string
            File path
        compression : str, optional
            forwarded to ``pyarrow.parquet.write_table``
        kwargs are passed to ``pyarrow.parquet.write_table``
        """
        import pyarrow
        from pyarrow import parquet as pq

        table = pyarrow.Table.from_pandas(df)
        pq.write_table(table, path,
                       compression=compression, **kwargs)

    def read(self, path, **kwargs):
        """Read a parquet file at *path* and return a DataFrame.

        kwargs are passed to ``pyarrow.parquet.read_table``
        (accepting no kwargs remains valid, so callers are unaffected)
        """
        import pyarrow
        return pyarrow.parquet.read_table(path, **kwargs).to_pandas()
23
46
24
- def _try_import_fastparquet ():
25
- # since pandas is a dependency of fastparquet
26
- # we need to import on first use
27
47
28
- try :
29
- import fastparquet
30
- except ImportError :
31
- raise ImportError ("fastparquet is required for parquet support\n \n "
32
- "you can install via conda\n "
33
- "conda install fastparquet -c conda-forge\n "
34
- "\n or via pip\n "
35
- "pip install fastparquet" )
48
class FastParquetImpl(object):
    """parquet implementation backed by the fastparquet library"""

    def __init__(self):
        # since pandas is a dependency of fastparquet
        # we need to import on first use
        try:
            import fastparquet  # noqa
        except ImportError:
            raise ImportError("fastparquet is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install fastparquet -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install fastparquet")

    def write(self, df, path, compression=None, **kwargs):
        """Write *df* to *path* as a parquet file.

        Parameters
        ----------
        df : DataFrame
        path : string
            File path
        compression : str, optional
            forwarded to ``fastparquet.write``
        kwargs are passed to ``fastparquet.write``
        """
        import fastparquet

        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.
        with catch_warnings(record=True):
            fastparquet.write(path, df,
                              compression=compression, **kwargs)

    def read(self, path, **kwargs):
        """Read a parquet file at *path* and return a DataFrame.

        kwargs are passed to ``fastparquet.ParquetFile.to_pandas``
        (accepting no kwargs remains valid, so callers are unaffected)
        """
        import fastparquet
        pf = fastparquet.ParquetFile(path)
        return pf.to_pandas(**kwargs)
77
+
78
+
79
+ def to_parquet (df , path , engine , compression = None , ** kwargs ):
46
80
"""
47
81
Write a DataFrame to the pyarrow
48
82
@@ -55,9 +89,10 @@ def to_parquet(df, path, engine, compression=None):
55
89
supported are {'pyarrow', 'fastparquet'}
56
90
compression : str, optional
57
91
compression method, includes {'gzip', 'snappy', 'brotli'}
92
+ kwargs are passed to the engine
58
93
"""
59
94
60
- _validate_engine (engine )
95
+ impl = get_engine (engine )
61
96
62
97
if not isinstance (df , DataFrame ):
63
98
raise ValueError ("to_parquet only support IO with DataFrames" )
@@ -92,24 +127,10 @@ def to_parquet(df, path, engine, compression=None):
92
127
if df .columns .inferred_type not in valid_types :
93
128
raise ValueError ("parquet must have string column names" )
94
129
95
- if engine == 'pyarrow' :
96
- pyarrow = _try_import_pyarrow ()
97
- from pyarrow import parquet as pq
98
-
99
- table = pyarrow .Table .from_pandas (df )
100
- pq .write_table (table , path , compression = compression )
101
-
102
- elif engine == 'fastparquet' :
103
- fastparquet = _try_import_fastparquet ()
104
-
105
- # thriftpy/protocol/compact.py:339:
106
- # DeprecationWarning: tostring() is deprecated.
107
- # Use tobytes() instead.
108
- with catch_warnings (record = True ):
109
- fastparquet .write (path , df , compression = compression )
130
+ return impl .write (df , path , compression = compression )
110
131
111
132
112
- def read_parquet (path , engine ):
133
+ def read_parquet (path , engine , ** kwargs ):
113
134
"""
114
135
Load a parquet object from the file path
115
136
@@ -121,20 +142,13 @@ def read_parquet(path, engine):
121
142
File path
122
143
engine : parquet engine
123
144
supported are {'pyarrow', 'fastparquet'}
145
+ kwargs are passed to the engine
124
146
125
147
Returns
126
148
-------
127
149
type of object stored in file
128
150
129
151
"""
130
152
131
- _validate_engine (engine )
132
-
133
- if engine == 'pyarrow' :
134
- pyarrow = _try_import_pyarrow ()
135
- return pyarrow .parquet .read_table (path ).to_pandas ()
136
-
137
- elif engine == 'fastparquet' :
138
- fastparquet = _try_import_fastparquet ()
139
- pf = fastparquet .ParquetFile (path )
140
- return pf .to_pandas ()
153
+ impl = get_engine (engine )
154
+ return impl .read (path )
0 commit comments