Commit 1636681

Update documentation; use `with` to clean up the temporary test directory; add partition_cols support for fastparquet
1 parent 0d9f878 commit 1636681

File tree

6 files changed: +85 −24 lines changed

doc/source/io.rst

+2

@@ -4574,6 +4574,8 @@ Several caveats.
 * Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.
+* ``partition_cols`` will be used for partitioning the dataset, where the dataset will be written to multiple
+  files in the path specified. Therefore, the path specified must be a directory path.
 
 You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
 If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``,
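
As a rough illustration of the behaviour described above (the frame, column name and output path below are hypothetical, and a pyarrow >= 0.7.0 install is assumed), a partitioned write produces a directory tree rather than a single file:

import pandas as pd

df = pd.DataFrame({'year': [2017, 2018, 2018],
                   'value': [1.0, 2.0, 3.0]})

# The path is treated as a root directory; one sub-directory is created per
# distinct value of the partition column, e.g.
#   parquet_dataset/year=2017/<part>.parquet
#   parquet_dataset/year=2018/<part>.parquet
df.to_parquet('parquet_dataset', engine='pyarrow',
              partition_cols=['year'], compression=None)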

pandas/core/frame.py

+11 −3

@@ -1970,7 +1970,7 @@ def to_feather(self, fname):
         to_feather(self, fname)
 
     def to_parquet(self, fname, engine='auto', compression='snappy',
-                   index=None, **kwargs):
+                   index=None, partition_cols=None, **kwargs):
         """
         Write a DataFrame to the binary parquet format.
 
@@ -1984,7 +1984,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
         Parameters
         ----------
         fname : str
-            String file path.
+            File path or root directory path. Will be used as the root
+            directory path while writing a partitioned dataset.
         engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
             Parquet library to use. If 'auto', then the option
             ``io.parquet.engine`` is used. The default ``io.parquet.engine``
@@ -1998,6 +1999,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
             the behavior depends on the chosen engine.
 
             .. versionadded:: 0.24.0
+        partition_cols : list, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+            The behaviour applies only to pyarrow >= 0.7.0 and fastparquet;
+            for other versions, this argument will be ignored.
+            .. versionadded:: 0.24.0
 
         **kwargs
             Additional arguments passed to the parquet library. See
@@ -2027,7 +2034,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
         """
         from pandas.io.parquet import to_parquet
         to_parquet(self, fname, engine,
-                   compression=compression, index=index, **kwargs)
+                   compression=compression, index=index,
+                   partition_cols=partition_cols, **kwargs)
 
     @Substitution(header='Write out the column names. If a list of strings '
                          'is given, it is assumed to be aliases for the '
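
Because columns are partitioned in the order they are given, nesting follows the list order. A minimal sketch, assuming hypothetical column names and an illustrative output path:

import pandas as pd

df = pd.DataFrame({'country': ['US', 'US', 'DE'],
                   'year': [2017, 2018, 2018],
                   'value': [1.0, 2.0, 3.0]})

# partition_cols=['country', 'year'] nests the directories in that order:
#   dataset_root/country=US/year=2017/...
#   dataset_root/country=US/year=2018/...
#   dataset_root/country=DE/year=2018/...
df.to_parquet('dataset_root', engine='pyarrow',
              partition_cols=['country', 'year'], compression=None)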

pandas/io/parquet.py

+24 −11

@@ -104,7 +104,8 @@ def __init__(self):
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
-              coerce_timestamps='ms', index=None, **kwargs):
+              coerce_timestamps='ms', index=None, partition_cols=None,
+              **kwargs):
         self.validate_dataframe(df)
 
         # Only validate the index if we're writing it.
@@ -125,10 +126,11 @@ def write(self, df, path, compression='snappy',
 
         else:
            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-        if 'partition_cols' in kwargs:
+        if partition_cols is not None:
            self.api.parquet.write_to_dataset(
                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+                coerce_timestamps=coerce_timestamps,
+                partition_cols=partition_cols, **kwargs)
         else:
            self.api.parquet.write_table(
                table, path, compression=compression,
@@ -211,12 +213,16 @@ def __init__(self):
             )
         self.api = fastparquet
 
-    def write(self, df, path, compression='snappy', index=None, **kwargs):
+    def write(self, df, path, compression='snappy', index=None,
+              partition_cols=None, **kwargs):
         self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
         # Use tobytes() instead.
 
+        if partition_cols is not None:
+            kwargs['file_scheme'] = 'hive'
+
         if is_s3_url(path):
             # path is s3:// so we need to open the s3file in 'wb' mode.
             # TODO: Support 'ab'
@@ -229,7 +235,8 @@ def write(self, df, path, compression='snappy', index=None, **kwargs):
 
         with catch_warnings(record=True):
             self.api.write(path, df, compression=compression,
-                           write_index=index, **kwargs)
+                           write_index=index, partition_on=partition_cols,
+                           **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         if is_s3_url(path):
@@ -249,16 +256,15 @@ def read(self, path, columns=None, **kwargs):
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', index=None,
-               **kwargs):
+               partition_cols=None, **kwargs):
     """
     Write a DataFrame to the parquet format.
 
     Parameters
     ----------
-    df : DataFrame
-    path : string
-        File path ( Will be used as `root_path` if
-        `partition_cols` is provided as parameter for 'pyarrow' engine).
+    path : str
+        File path or root directory path. Will be used as the root directory
+        path while writing a partitioned dataset.
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet library to use. If 'auto', then the option
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``
@@ -272,11 +278,18 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None,
         engine's default behavior will be used.
 
         .. versionadded 0.24.0
+    partition_cols : list, optional
+        Column names by which to partition the dataset.
+        Columns are partitioned in the order they are given.
+        The behaviour applies only to pyarrow >= 0.7.0 and fastparquet;
+        for other versions, this argument will be ignored.
+        .. versionadded:: 0.24.0
     kwargs
         Additional keyword arguments passed to the engine
     """
     impl = get_engine(engine)
-    return impl.write(df, path, compression=compression, index=index, **kwargs)
+    return impl.write(df, path, compression=compression, index=index,
+                      partition_cols=partition_cols, **kwargs)
 
 
 def read_parquet(path, engine='auto', columns=None, **kwargs):
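
To make the two code paths above concrete, here is a minimal sketch of what each engine is ultimately asked to do when partition_cols=['year'] is passed (the frame, column name and output directories are hypothetical; pyarrow >= 0.7.0 and fastparquet are assumed to be installed):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import fastparquet

df = pd.DataFrame({'year': [2017, 2018], 'value': [1.0, 2.0]})

# pyarrow branch: the converted table goes through write_to_dataset, which
# creates one sub-directory per partition value under the root path.
table = pa.Table.from_pandas(df)
pq.write_to_dataset(table, 'pyarrow_dataset', partition_cols=['year'])

# fastparquet branch: partitioning is requested via partition_on, and the
# wrapper switches the file scheme to 'hive' so the path is a directory.
fastparquet.write('fastparquet_dataset', df,
                  file_scheme='hive', partition_on=['year'])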

pandas/tests/io/test_parquet.py

+22 −10

@@ -1,8 +1,7 @@
 """ test parquet compat """
+import os
 
 import pytest
-import tempfile
-import shutil
 import datetime
 from distutils.version import LooseVersion
 from warnings import catch_warnings
@@ -481,18 +480,19 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
                          path='s3://pandas-test/pyarrow.parquet')
 
     def test_partition_cols_supported(self, pa_ge_070, df_full):
+        # GH #23283
         partition_cols = ['bool', 'int']
         df = df_full
-        path = tempfile.mkdtemp()
-        df.to_parquet(path, partition_cols=partition_cols,
-                      compression=None)
-        import pyarrow.parquet as pq
-        dataset = pq.ParquetDataset(path, validate_schema=False)
-        assert len(dataset.partitions.partition_names) == 2
-        assert dataset.partitions.partition_names == set(partition_cols)
-        shutil.rmtree(path)
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(path, partition_cols=partition_cols,
+                          compression=None)
+            import pyarrow.parquet as pq
+            dataset = pq.ParquetDataset(path, validate_schema=False)
+            assert len(dataset.partitions.partition_names) == 2
+            assert dataset.partitions.partition_names == set(partition_cols)
 
     def test_ignore_partition_cols_lt_070(self, pa_lt_070, df_full):
+        # GH #23283
         partition_cols = ['bool', 'int']
         pa = pa_lt_070
         df = df_full
@@ -564,3 +564,15 @@ def test_s3_roundtrip(self, df_compat, s3_resource, fp):
         # GH #19134
         check_round_trip(df_compat, fp,
                          path='s3://pandas-test/fastparquet.parquet')
+
+    def test_partition_cols_supported(self, fp, df_full):
+        # GH #23283
+        partition_cols = ['bool', 'int']
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(path, partition_cols=partition_cols,
+                          compression=None)
+            assert os.path.exists(path)
+            import fastparquet
+            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
+            assert len(actual_partition_cols) == 2
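
The tests above inspect the partitions through each engine's own API; reading the data back through pandas itself also works by pointing read_parquet at the dataset directory. A short sketch, assuming a previously written, hypothetical 'parquet_dataset' directory and the pyarrow engine:

import pandas as pd

# The partition columns are reconstructed from the directory names
# (with pyarrow they typically come back as categorical-like values).
result = pd.read_parquet('parquet_dataset', engine='pyarrow')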

pandas/tests/util/test_testing.py

+9

@@ -875,3 +875,12 @@ def test_datapath_missing(datapath, request):
     )
 
     assert result == expected
+
+
+def test_create_temp_directory():
+    temppath = ''
+    with tm.ensure_clean_dir() as path:
+        assert os.path.exists(path)
+        assert os.path.isdir(path)
+        temppath = path
+    assert not os.path.exists(temppath)

pandas/util/testing.py

+17

@@ -772,6 +772,23 @@ def ensure_clean(filename=None, return_filelike=False):
             print("Exception on removing file: {error}".format(error=e))
 
 
+@contextmanager
+def ensure_clean_dir():
+    """
+    Get a temporary directory path and agree to remove it on close.
+
+    Yields
+    ------
+    Temporary directory path
+    """
+    directory_name = tempfile.mkdtemp(suffix='')
+    try:
+        yield directory_name
+    finally:
+        import shutil
+        shutil.rmtree(directory_name)
+
+
 # -----------------------------------------------------------------------------
 # Comparators
 