From 47caeb8af2aeb2a9ce49a1a003a7667da5a004a8 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Tue, 9 May 2017 21:30:28 -0400
Subject: [PATCH 01/10] Draft metadata specification doc for Apache Parquet

---
 doc/source/index.rst.template |  1 +
 doc/source/metadata.rst       | 94 +++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 doc/source/metadata.rst
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index f5c65e175b0db..01a1d4f795c19 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -146,6 +146,7 @@ See the package overview for more detail about what's in the library.
     comparison_with_r
     comparison_with_sql
     comparison_with_sas
+    metadata
     {% endif -%}
     {% if api -%}
     api
diff --git a/doc/source/metadata.rst b/doc/source/metadata.rst
new file mode 100644
index 0000000000000..37e89a9f33039
--- /dev/null
+++ b/doc/source/metadata.rst
@@ -0,0 +1,94 @@
+.. _metadata:
+
+.. currentmodule:: pandas
+
+**********************************************
+Storing pandas Objects in Various File Formats
+**********************************************
+
+This document provides specifications for metadata to assist with reading and
+writing pandas objects to different third party file formats.
+
+Apache Parquet
+--------------
+
+The Apache Parquet format provides key-value metadata at the file and column
+level, stored in the footer of the Parquet file:
+
+.. code-block:: shell
+
+  5: optional list<KeyValue> key_value_metadata
+
+where ``KeyValue`` is
+
+.. code-block:: shell
+
+   struct KeyValue {
+     1: required string key
+     2: optional string value
+   }
+
+So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
+``pandas`` metadata key with the the value stored as :
+
+.. code-block:: text
+
+   {'index_columns': ['__index_level_0__', '__index_level_1__', ...],
+    'columns': [<c0>, <c1>, ...]}
+
+Here, ``<c0>`` and so forth are dictionaries containing the metadata for each
+column. This has JSON form:
+
+.. code-block:: text
+
+   {'name': column_name,
+    'type': pandas_type,
+    'numpy_dtype': numpy_type,
+    'metadata': type_metadata}
+
+``pandas_type`` is the logical type of the column, and is one of:
+
+* Boolean: ``'bool'``
+* Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'``
+* Floats: ``'float16', 'float32', 'float64'``
+* Datetime: ``'datetime', 'datetimetz'``
+* String: ``'unicode', 'bytes'``
+* Categorical: ``'categorical'``
+
+The ``numpy_type`` is the physical storage type of the column, which is the
+result of ``str(dtype)`` for the underlying NumPy array that holds the data. So
+for ``datetimetz`` this is ``datetime64[ns]`` and for categorical, it may be
+any of the supported integer categorical types.
+
+The ``type_metadata`` is ``None`` except for:
+
+* ``datetimetz``: ``{'timezone': zone}``, e.g. ``{'timezone', 'America/New_York'}``
+* ``categorical``: ``{'num_categories': K}``
+
+As an example of fully-formed metadata:
+
+.. code-block:: text
+
+   {'index_columns': ['__index_level_0__'],
+    'columns': [
+        {'name': 'c0',
+         'type': 'int8',
+         'numpy_type': 'int8',
+         'metadata': None},
+        {'name': 'c1',
+         'type': 'string',
+         'numpy_type': 'object',
+         'metadata': None},
+        {'name': 'c2',
+         'type': 'categorical',
+         'numpy_type': 'int16',
+         'metadata': {'num_categories': 1000}},
+        {'name': 'c3',
+         'type': 'datetimetz',
+         'numpy_type': 'datetime64[ns]',
+         'metadata': {'timezone': 'America/Los_Angeles'}},
+        {'name': '__index_level_0__',
+         'type': 'int64',
+         'numpy_type': 'int64',
+         'metadata': None}
+    ]}

From 0c57d656223de28d0dfb458c285060e5846e9c8d Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Tue, 9 May 2017 21:33:06 -0400
Subject: [PATCH 02/10] Tweaks, add pandas version

---
 doc/source/metadata.rst | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/doc/source/metadata.rst b/doc/source/metadata.rst
index 37e89a9f33039..a829ffa655e7f 100644
--- a/doc/source/metadata.rst
+++ b/doc/source/metadata.rst
@@ -34,7 +34,8 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
 .. code-block:: text
 
    {'index_columns': ['__index_level_0__', '__index_level_1__', ...],
-    'columns': [<c0>, <c1>, ...]}
+    'columns': [<c0>, <c1>, ...],
+    'pandas_version': '0.20.0'}
 
 Here, ``<c0>`` and so forth are dictionaries containing the metadata for each
 column. This has JSON form:
@@ -76,7 +77,7 @@ As an example of fully-formed metadata:
          'numpy_type': 'int8',
          'metadata': None},
         {'name': 'c1',
-         'type': 'string',
+         'type': 'bytes',
          'numpy_type': 'object',
          'metadata': None},
         {'name': 'c2',
@@ -91,4 +92,5 @@ As an example of fully-formed metadata:
          'type': 'int64',
          'numpy_type': 'int64',
          'metadata': None}
-    ]}
+    ],
+    'pandas_version': '0.20.0'}

From 656acbef5a0692b222aa8f2284ba7231dbe191d0 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Tue, 9 May 2017 21:38:51 -0400
Subject: [PATCH 03/10] Relax metadata key

---
 doc/source/metadata.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/source/metadata.rst b/doc/source/metadata.rst
index a829ffa655e7f..e59651c774a7f 100644
--- a/doc/source/metadata.rst
+++ b/doc/source/metadata.rst
@@ -66,6 +66,9 @@ The ``type_metadata`` is ``None`` except for:
 * ``datetimetz``: ``{'timezone': zone}``, e.g. ``{'timezone', 'America/New_York'}``
 * ``categorical``: ``{'num_categories': K}``
 
+For types other than these, the ``'metadata'`` key can be
+omitted. Implementations can assume ``None`` if the key is not present.
+
 As an example of fully-formed metadata:
 
 .. code-block:: text

From 17c6ba39e95db790d15e1f0ddfffde008bf764f5 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Tue, 9 May 2017 21:41:35 -0400
Subject: [PATCH 04/10] Be explicit that the metadata is file-level

---
 doc/source/metadata.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/metadata.rst b/doc/source/metadata.rst
index e59651c774a7f..e939c58a492a4 100644
--- a/doc/source/metadata.rst
+++ b/doc/source/metadata.rst
@@ -29,7 +29,7 @@ where ``KeyValue`` is
    }
 
 So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
-``pandas`` metadata key with the the value stored as :
+``pandas`` metadata key in the ``FileMetaData`` with the the value stored as :
 
 .. code-block:: text
 

From 2155ea928edd3668865687b81c23f55ff6a2c939 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Tue, 9 May 2017 21:42:17 -0400
Subject: [PATCH 05/10] Don't hard code version

---
 doc/source/metadata.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/metadata.rst b/doc/source/metadata.rst
index e939c58a492a4..bd1bd125e3b51 100644
--- a/doc/source/metadata.rst
+++ b/doc/source/metadata.rst
@@ -35,7 +35,7 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
 
    {'index_columns': ['__index_level_0__', '__index_level_1__', ...],
     'columns': [<c0>, <c1>, ...],
-    'pandas_version': '0.20.0'}
+    'pandas_version': $VERSION}
 
 Here, ``<c0>`` and so forth are dictionaries containing the metadata for each
 column. This has JSON form:

From d2c66d8cba256e3c827342a69429f96d998c1257 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Thu, 11 May 2017 08:30:56 -0400
Subject: [PATCH 06/10] Code reviews

---
 doc/source/metadata.rst | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/doc/source/metadata.rst b/doc/source/metadata.rst
index bd1bd125e3b51..eb865b71b404e 100644
--- a/doc/source/metadata.rst
+++ b/doc/source/metadata.rst
@@ -12,8 +12,9 @@ writing pandas objects to different third party file formats.
 Apache Parquet
 --------------
 
-The Apache Parquet format provides key-value metadata at the file and column
-level, stored in the footer of the Parquet file:
+The `Apache Parquet <https://github.com/apache/parquet-format>`__ format
+provides key-value metadata at the file and column level, stored in the footer
+of the Parquet file:
 
 .. code-block:: shell
 
@@ -64,7 +65,7 @@ any of the supported integer categorical types.
 The ``type_metadata`` is ``None`` except for:
 
 * ``datetimetz``: ``{'timezone': zone}``, e.g. ``{'timezone', 'America/New_York'}``
-* ``categorical``: ``{'num_categories': K}``
+* ``categorical``: ``{'num_categories': K, 'ordered': is_ordered}``
 
 For types other than these, the ``'metadata'`` key can be
 omitted. Implementations can assume ``None`` if the key is not present.

From e0a176e92ba878c3937e8cb8636f09c137ed6993 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Mon, 15 May 2017 21:49:51 -0400
Subject: [PATCH 07/10] Move Parquet metadata to developer.rst, account for
 code reviews

---
 doc/source/developer.rst | 104 +++++++++++++++++++++++++++++++++++++++
 doc/source/metadata.rst  | 100 -------------------------------------
 2 files changed, 104 insertions(+), 100 deletions(-)
 delete mode 100644 doc/source/metadata.rst

diff --git a/doc/source/developer.rst b/doc/source/developer.rst
index 7633b4732479c..bc2e158b1b3b8 100644
--- a/doc/source/developer.rst
+++ b/doc/source/developer.rst
@@ -16,3 +16,107 @@ Developer
 *********
 
 This section will focus on downstream applications of pandas.
+
+Storing pandas DataFrame objects in Apache Parquet format
+---------------------------------------------------------
+
+The `Apache Parquet <https://github.com/apache/parquet-format>`__ format
+provides key-value metadata at the file and column level, stored in the footer
+of the Parquet file:
+
+.. code-block:: shell
+
+  5: optional list<KeyValue> key_value_metadata
+
+where ``KeyValue`` is
+
+.. code-block:: shell
+
+   struct KeyValue {
+     1: required string key
+     2: optional string value
+   }
+
+So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
+``pandas`` metadata key in the ``FileMetaData`` with the value stored as:
+
+.. code-block:: text
+
+   {'index_columns': ['__index_level_0__', '__index_level_1__', ...],
+    'columns': [<c0>, <c1>, ...],
+    'pandas_version': $VERSION}
+
+Here, ``<c0>`` and so forth are dictionaries containing the metadata for each
+column. This has JSON form:
+
+.. code-block:: text
+
+   {'name': column_name,
+    'pandas_type': pandas_type,
+    'numpy_dtype': numpy_type,
+    'metadata': type_metadata}
+
+``pandas_type`` is the logical type of the column, and is one of:
+
+* Boolean: ``'bool'``
+* Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'``
+* Floats: ``'float16', 'float32', 'float64'``
+* Datetime: ``'datetime', 'datetimetz'``
+* String: ``'unicode', 'bytes'``
+* Categorical: ``'categorical'``
+* Other Python objects: ``'object'``
+
+The ``numpy_type`` is the physical storage type of the column, which is the
+result of ``str(dtype)`` for the underlying NumPy array that holds the data. So
+for ``datetimetz`` this is ``datetime64[ns]`` and for categorical, it may be
+any of the supported integer categorical types.
+
+The ``type_metadata`` is ``None`` except for:
+
+* ``datetimetz``: ``{'timezone': zone}``, e.g. ``{'timezone', 'America/New_York'}``
+* ``categorical``: ``{'num_categories': K, 'ordered': is_ordered}``
+* ``object``: ``{'encoding': encoding}``
+
+Objects can be serialized and stored in ``BYTE_ARRAY`` Parquet columns. The
+encoding can be one of:
+
+* ``'pickle'``
+* ``'msgpack'``
+* ``'bson'``
+* ``'json'``
+
+For types other than these, the ``'metadata'`` key can be
+omitted. Implementations can assume ``None`` if the key is not present.
+
+As an example of fully-formed metadata:
+
+.. code-block:: text
+
+   {'index_columns': ['__index_level_0__'],
+    'columns': [
+        {'name': 'c0',
+         'pandas_type': 'int8',
+         'numpy_type': 'int8',
+         'metadata': None},
+        {'name': 'c1',
+         'pandas_type': 'bytes',
+         'numpy_type': 'object',
+         'metadata': None},
+        {'name': 'c2',
+         'pandas_type': 'categorical',
+         'numpy_type': 'int16',
+         'metadata': {'num_categories': 1000, 'ordered': False}},
+        {'name': 'c3',
+         'pandas_type': 'datetimetz',
+         'numpy_type': 'datetime64[ns]',
+         'metadata': {'timezone': 'America/Los_Angeles'}},
+        {'name': 'c4',
+         'pandas_type': 'object',
+         'numpy_type': 'object',
+         'metadata': {'encoding': 'pickle'}},
+        {'name': '__index_level_0__',
+         'pandas_type': 'int64',
+         'numpy_type': 'int64',
+         'metadata': None}
+    ],
+    'pandas_version': '0.20.0'}
diff --git a/doc/source/metadata.rst b/doc/source/metadata.rst
deleted file mode 100644
index eb865b71b404e..0000000000000
--- a/doc/source/metadata.rst
+++ /dev/null
@@ -1,100 +0,0 @@
-.. _metadata:
-
-.. currentmodule:: pandas
-
-**********************************************
-Storing pandas Objects in Various File Formats
-**********************************************
-
-This document provides specifications for metadata to assist with reading and
-writing pandas objects to different third party file formats.
-
-Apache Parquet
---------------
-
-The `Apache Parquet <https://github.com/apache/parquet-format>`__ format
-provides key-value metadata at the file and column level, stored in the footer
-of the Parquet file:
-
-.. code-block:: shell
-
-  5: optional list<KeyValue> key_value_metadata
-
-where ``KeyValue`` is
-
-.. code-block:: shell
-
-   struct KeyValue {
-     1: required string key
-     2: optional string value
-   }
-
-So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
-``pandas`` metadata key in the ``FileMetaData`` with the the value stored as :
-
-.. code-block:: text
-
-   {'index_columns': ['__index_level_0__', '__index_level_1__', ...],
-    'columns': [<c0>, <c1>, ...],
-    'pandas_version': $VERSION}
-
-Here, ``<c0>`` and so forth are dictionaries containing the metadata for each
-column. This has JSON form:
-
-.. code-block:: text
-
-   {'name': column_name,
-    'type': pandas_type,
-    'numpy_dtype': numpy_type,
-    'metadata': type_metadata}
-
-``pandas_type`` is the logical type of the column, and is one of:
-
-* Boolean: ``'bool'``
-* Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'``
-* Floats: ``'float16', 'float32', 'float64'``
-* Datetime: ``'datetime', 'datetimetz'``
-* String: ``'unicode', 'bytes'``
-* Categorical: ``'categorical'``
-
-The ``numpy_type`` is the physical storage type of the column, which is the
-result of ``str(dtype)`` for the underlying NumPy array that holds the data. So
-for ``datetimetz`` this is ``datetime64[ns]`` and for categorical, it may be
-any of the supported integer categorical types.
-
-The ``type_metadata`` is ``None`` except for:
-
-* ``datetimetz``: ``{'timezone': zone}``, e.g. ``{'timezone', 'America/New_York'}``
-* ``categorical``: ``{'num_categories': K, 'ordered': is_ordered}``
-
-For types other than these, the ``'metadata'`` key can be
-omitted. Implementations can assume ``None`` if the key is not present.
-
-As an example of fully-formed metadata:
-
-.. code-block:: text
-
-   {'index_columns': ['__index_level_0__'],
-    'columns': [
-        {'name': 'c0',
-         'type': 'int8',
-         'numpy_type': 'int8',
-         'metadata': None},
-        {'name': 'c1',
-         'type': 'bytes',
-         'numpy_type': 'object',
-         'metadata': None},
-        {'name': 'c2',
-         'type': 'categorical',
-         'numpy_type': 'int16',
-         'metadata': {'num_categories': 1000}},
-        {'name': 'c3',
-         'type': 'datetimetz',
-         'numpy_type': 'datetime64[ns]',
-         'metadata': {'timezone': 'America/Los_Angeles'}},
-        {'name': '__index_level_0__',
-         'type': 'int64',
-         'numpy_type': 'int64',
-         'metadata': None}
-    ],
-    'pandas_version': '0.20.0'}

From 67448be6b5f927f1f1336ae7474b5429325fc39d Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Mon, 15 May 2017 22:55:56 -0400
Subject: [PATCH 08/10] Code review comments

---
 doc/source/developer.rst      | 24 ++++++++++++++++--------
 doc/source/index.rst.template |  1 -
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/doc/source/developer.rst b/doc/source/developer.rst
index bc2e158b1b3b8..d11d9818e934c 100644
--- a/doc/source/developer.rst
+++ b/doc/source/developer.rst
@@ -17,6 +17,8 @@ Developer
 
 This section will focus on downstream applications of pandas.
 
+.. _apache.parquet:
+
 Storing pandas DataFrame objects in Apache Parquet format
 ---------------------------------------------------------
 
@@ -74,16 +76,22 @@ any of the supported integer categorical types.
 The ``type_metadata`` is ``None`` except for:
 
 * ``datetimetz``: ``{'timezone': zone}``, e.g. ``{'timezone', 'America/New_York'}``
-* ``categorical``: ``{'num_categories': K, 'ordered': is_ordered}``
-* ``object``: ``{'encoding': encoding}``
+* ``categorical``: ``{'num_categories': K, 'ordered': is_ordered, 'type': $TYPE}``
+
+  * Here ``'type'`` is optional, and can be a nested pandas type specification
+    (but not categorical).
+
+* ``unicode``: ``{'encoding': encoding}``
+
+  * The encoding is optional, and if not present is UTF-8.
 
-Objects can be serialized and stored in ``BYTE_ARRAY`` Parquet columns. The
-encoding can be one of:
+* ``object``: ``{'encoding': encoding}``. Objects can be serialized and stored
+  in ``BYTE_ARRAY`` Parquet columns. The encoding can be one of:
 
-* ``'pickle'``
-* ``'msgpack'``
-* ``'bson'``
-* ``'json'``
+  * ``'pickle'``
+  * ``'msgpack'``
+  * ``'bson'``
+  * ``'json'``
 
 For types other than these, the ``'metadata'`` key can be
 omitted. Implementations can assume ``None`` if the key is not present.
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index 01a1d4f795c19..f5c65e175b0db 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -146,7 +146,6 @@ See the package overview for more detail about what's in the library.
     comparison_with_r
     comparison_with_sql
     comparison_with_sas
-    metadata
     {% endif -%}
     {% if api -%}
     api

From a2a42c053a77a4750fe31f4490e3ae7f361df635 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Tue, 16 May 2017 16:59:31 -0400
Subject: [PATCH 09/10] Review comments

---
 doc/source/developer.rst | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/doc/source/developer.rst b/doc/source/developer.rst
index d11d9818e934c..f2dd4bb7e7e4f 100644
--- a/doc/source/developer.rst
+++ b/doc/source/developer.rst
@@ -63,7 +63,7 @@ column. This has JSON form:
 * Boolean: ``'bool'``
 * Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'``
 * Floats: ``'float16', 'float32', 'float64'``
-* Datetime: ``'datetime', 'datetimetz'``
+* Date and Time Types: ``'datetime', 'datetimetz', 'timedelta'``
 * String: ``'unicode', 'bytes'``
 * Categorical: ``'categorical'``
 * Other Python objects: ``'object'``
@@ -75,7 +75,9 @@ any of the supported integer categorical types.
 
 The ``type_metadata`` is ``None`` except for:
 
-* ``datetimetz``: ``{'timezone': zone}``, e.g. ``{'timezone', 'America/New_York'}``
+* ``datetimetz``: ``{'timezone': zone, 'unit': 'ns'}``, e.g. ``{'timezone':
+  'America/New_York', 'unit': 'ns'}``. The ``'unit'`` is optional, and if
+  omitted it is assumed to be nanoseconds.
 * ``categorical``: ``{'num_categories': K, 'ordered': is_ordered, 'type': $TYPE}``
 
   * Here ``'type'`` is optional, and can be a nested pandas type specification
@@ -93,6 +95,9 @@ The ``type_metadata`` is ``None`` except for:
   * ``'bson'``
   * ``'json'``
 
+* ``timedelta``: ``{'unit': 'ns'}``. The ``'unit'`` is optional, and if omitted
+  it is assumed to be nanoseconds. This metadata is optional altogether.
+
 For types other than these, the ``'metadata'`` key can be
 omitted. Implementations can assume ``None`` if the key is not present.
 

From 2d00f55545c8913cf791a620ca2e7db5925453c9 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wes.mckinney@twosigma.com>
Date: Tue, 16 May 2017 18:25:56 -0400
Subject: [PATCH 10/10] Fix typo

---
 doc/source/developer.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/developer.rst b/doc/source/developer.rst
index f2dd4bb7e7e4f..78c12b7e23b37 100644
--- a/doc/source/developer.rst
+++ b/doc/source/developer.rst
@@ -55,7 +55,7 @@ column. This has JSON form:
 
    {'name': column_name,
     'pandas_type': pandas_type,
-    'numpy_dtype': numpy_type,
+    'numpy_type': numpy_type,
     'metadata': type_metadata}
 
 ``pandas_type`` is the logical type of the column, and is one of: