From cbd643e79c5af8cb12e41d19102fef8465ed39a7 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 9 Nov 2017 14:43:57 -0500 Subject: [PATCH 1/4] DOC: Update pyarrow format description around index levels --- doc/source/developer.rst | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/developer.rst b/doc/source/developer.rst index 9c214020ab43d..b073a21b6345c 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -50,7 +50,7 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a 'pandas_version': $VERSION} Here, ``<c0>``/``<ci0>`` and so forth are dictionaries containing the metadata -for each column. This has JSON form: +for each column, *including the index columns*. This has JSON form: .. code-block:: text @@ -59,6 +59,26 @@ for each column. This has JSON form: 'numpy_type': numpy_type, 'metadata': metadata} +.. note:: + + The last ``N`` values of ``metadata['columns']``, where ``N = + len(metadata['index_columns'])``, contain information about the row indexes, + including the name of the index level and type information. + + Every index column is stored with a name matching the pattern + ``__index_level_\d+__`` and its corresponding column information can be + found with the following code snippet. + + ..
code-block:: python + + # assuming there's at least 3 levels in the index + index_columns = metadata['index_columns'] + columns = metadata['columns'] + ith_index = 2 + assert index_columns[ith_index] == '__index_level_2__' + ith_index_info = columns[-len(index_columns):][ith_index] + ith_index_level_name = ith_index_info['name'] + ``pandas_type`` is the logical type of the column, and is one of: * Boolean: ``'bool'`` From 43c55f1bba93900ddefba9ee8c36d4ecec77037b Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 7 Dec 2017 09:43:21 -0500 Subject: [PATCH 2/4] Updates --- doc/source/developer.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/developer.rst b/doc/source/developer.rst index b073a21b6345c..197efcda6ca5c 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -55,6 +55,7 @@ for each column, *including the index columns*. This has JSON form: .. code-block:: text {'name': column_name, + 'field_name': arrow_table_column_name, 'pandas_type': pandas_type, 'numpy_type': numpy_type, 'metadata': metadata} @@ -129,32 +130,39 @@ As an example of fully-formed metadata: {'index_columns': ['__index_level_0__'], 'column_indexes': [ {'name': None, - 'pandas_type': 'string', + 'field_name': None, + 'pandas_type': 'unicode', 'numpy_type': 'object', - 'metadata': None} + 'metadata': {'encoding': 'UTF-8'}} ], 'columns': [ {'name': 'c0', + 'field_name': 'c0', 'pandas_type': 'int8', 'numpy_type': 'int8', 'metadata': None}, {'name': 'c1', + 'field_name': 'c1', 'pandas_type': 'bytes', 'numpy_type': 'object', 'metadata': None}, {'name': 'c2', + 'field_name': 'c2', 'pandas_type': 'categorical', 'numpy_type': 'int16', 'metadata': {'num_categories': 1000, 'ordered': False}}, {'name': 'c3', + 'field_name': 'c3', 'pandas_type': 'datetimetz', 'numpy_type': 'datetime64[ns]', 'metadata': {'timezone': 'America/Los_Angeles'}}, {'name': 'c4', + 'field_name': 'c4', 'pandas_type': 'object', 'numpy_type': 'object', 'metadata': 
{'encoding': 'pickle'}}, {'name': '__index_level_0__', + 'field_name': None, 'pandas_type': 'int64', 'numpy_type': 'int64', 'metadata': None} From 967c49968e99a9092ca9daddde0cd162d1d783c0 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 7 Dec 2017 11:28:20 -0500 Subject: [PATCH 3/4] Review comments --- doc/source/developer.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/developer.rst b/doc/source/developer.rst index 197efcda6ca5c..03685105c6598 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -130,7 +130,7 @@ As an example of fully-formed metadata: {'index_columns': ['__index_level_0__'], 'column_indexes': [ {'name': None, - 'field_name': None, + 'field_name': 'None', 'pandas_type': 'unicode', 'numpy_type': 'object', 'metadata': {'encoding': 'UTF-8'}} @@ -161,8 +161,8 @@ As an example of fully-formed metadata: 'pandas_type': 'object', 'numpy_type': 'object', 'metadata': {'encoding': 'pickle'}}, - {'name': '__index_level_0__', - 'field_name': None, + {'name': None, + 'field_name': '__index_level_0__', 'pandas_type': 'int64', 'numpy_type': 'int64', 'metadata': None} From fd7ff76df1cf198431a6d9b8fe940038e982e293 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 7 Dec 2017 11:39:30 -0500 Subject: [PATCH 4/4] More review comments --- doc/source/developer.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/developer.rst b/doc/source/developer.rst index 03685105c6598..5b9cbb7ae799a 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -55,21 +55,22 @@ for each column, *including the index columns*. This has JSON form: .. code-block:: text {'name': column_name, - 'field_name': arrow_table_column_name, + 'field_name': parquet_column_name, 'pandas_type': pandas_type, 'numpy_type': numpy_type, 'metadata': metadata} .. 
note:: - The last ``N`` values of ``metadata['columns']``, where ``N = - len(metadata['index_columns'])``, contain information about the row indexes, - including the name of the index level and type information. - Every index column is stored with a name matching the pattern ``__index_level_\d+__`` and its corresponding column information can be found with the following code snippet. + Following this naming convention isn't strictly necessary, but strongly + suggested for compatibility with Arrow. + + Here's an example of how the index metadata is structured in pyarrow: + .. code-block:: python # assuming there's at least 3 levels in the index