@@ -34,7 +34,50 @@ def get_engine(engine):
     return FastParquetImpl()
 
 
-class PyArrowImpl(object):
+class BaseImpl(object):
+
+    api = None  # module
+
+    @staticmethod
+    def _validate_index(df):
+        if not isinstance(df.index, Int64Index):
+            msg = (
+                "parquet does not support serializing {} for the index;"
+                "you can .reset_index() to make the index into column(s)"
+            )
+            raise ValueError(msg.format(type(df.index)))
+        if not df.index.equals(RangeIndex(len(df))):
+            raise ValueError(
+                "parquet does not support serializing a non-default index "
+                "for the index; you can .reset_index() to make the index "
+                "into column(s)"
+            )
+        if df.index.name is not None:
+            raise ValueError(
+                "parquet does not serialize index meta-data "
+                "on a default index"
+            )
+
+    @staticmethod
+    def _validate_columns(df):
+        # must have value column names (strings only)
+        if df.columns.inferred_type not in {'string', 'unicode'}:
+            raise ValueError("parquet must have string column names")
+
+    def validate_dataframe(self, df):
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only support IO with DataFrames")
+        self._validate_columns(df)
+        self._validate_index(df)
+
+    def write(self, df, path, compression, **kwargs):
+        raise NotImplementedError()
+
+    def read(self, path, columns=None, **kwargs):
+        raise NotImplementedError()
+
+
+class PyArrowImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of pyarrow
@@ -63,8 +106,14 @@ def __init__(self):
         self._pyarrow_lt_070 = LooseVersion(pyarrow.__version__) < '0.7.0'
         self.api = pyarrow
 
+    def _validate_index(self, df):
+        # pyarrow >= 0.7.0 supports multi-indexes so no need to validate
+        if self._pyarrow_lt_070:
+            super(PyArrowImpl, self)._validate_index(df)
+
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
+        self.validate_dataframe(df)
         path, _, _ = get_filepath_or_buffer(path)
         if self._pyarrow_lt_060:
             table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
@@ -83,12 +132,11 @@ def read(self, path, columns=None, **kwargs):
                                            **kwargs).to_pandas()
 
 
-class FastParquetImpl(object):
+class FastParquetImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of fastparquet
         # we need to import on first use
-
         try:
             import fastparquet
         except ImportError:
@@ -109,6 +157,7 @@ def __init__(self):
         self.api = fastparquet
 
     def write(self, df, path, compression='snappy', **kwargs):
+        self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
         # Use tobytes() instead.
@@ -140,46 +189,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     kwargs
         Additional keyword arguments passed to the engine
     """
-
     impl = get_engine(engine)
-
-    if not isinstance(df, DataFrame):
-        raise ValueError("to_parquet only support IO with DataFrames")
-
-    valid_types = {'string', 'unicode'}
-
-    # validate that we have only a default index
-    # raise on anything else as we don't serialize the index
-    # *unless* we're using pyarrow >= 0.7.1 which does support multi-indexes
-    if impl.api.__name__ == 'pyarrow' and not impl._pyarrow_lt_070:
-        validate_index = False
-    else:
-        validate_index = True
-
-    if validate_index:
-        if not isinstance(df.index, Int64Index):
-            raise ValueError("parquet does not support serializing {} "
-                             "for the index; you can .reset_index()"
-                             "to make the index into column(s)".format(
-                                 type(df.index)))
-
-        if not df.index.equals(RangeIndex.from_range(range(len(df)))):
-            raise ValueError("parquet does not support serializing a "
-                             "non-default index for the index; you "
-                             "can .reset_index() to make the index "
-                             "into column(s)")
-
-        if df.index.name is not None:
-            raise ValueError("parquet does not serialize index meta-data on a "
-                             "default index")
-
-    # validate columns
-    # ----------------
-
-    # must have value column names (strings only)
-    if df.columns.inferred_type not in valid_types:
-        raise ValueError("parquet must have string column names")
-
     return impl.write(df, path, compression=compression, **kwargs)
 
 
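For context, a minimal sketch of how the validation consolidated into BaseImpl.validate_dataframe is expected to surface to callers of to_parquet once this patch is applied. The file name and engine choice below are illustrative only, and with pyarrow >= 0.7.0 the index check is skipped entirely:

import pandas as pd

# A non-default (string) index fails BaseImpl._validate_index, so the
# caller is told to reset the index into regular column(s) first.
df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])

try:
    df.to_parquet("example.parquet", engine="fastparquet")
except ValueError as err:
    print(err)  # parquet does not support serializing ... for the index

# Resetting the index yields a default RangeIndex and string column names,
# which pass validate_dataframe(), so the write proceeds normally.
df.reset_index().to_parquet("example.parquet", engine="fastparquet")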