Add pandas_metadata attribute to pyarrow.Schema to make interactions simpler

wesm · wesm · commit ec929aebd506 · 2019-03-12T16:56:40.000-05:00
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
@@ -603,12 +603,9 @@ def table_to_blockmanager(options, table, categories=None,
                           ignore_metadata=False):
     all_columns = []
     column_indexes = []
-    metadata = table.schema.metadata
-    has_pandas_metadata = (not ignore_metadata and metadata is not None
-                           and b'pandas' in metadata)
+    pandas_metadata = table.schema.pandas_metadata
 
-    if has_pandas_metadata:
-        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
+    if not ignore_metadata and pandas_metadata is not None:
         all_columns = pandas_metadata['columns']
         column_indexes = pandas_metadata.get('column_indexes', [])
         index_descriptors = pandas_metadata['index_columns']
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
@@ -230,9 +230,7 @@ def test_index_metadata_field_name(self):
             columns=['a', None, '__index_level_0__'],
         )
         t = pa.Table.from_pandas(df, preserve_index=True)
-        raw_metadata = t.schema.metadata
-
-        js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
+        js = t.schema.pandas_metadata
 
         col1, col2, col3, idx0, foo = js['columns']
 
@@ -263,8 +261,7 @@ def test_categorical_column_index(self):
             columns=pd.Index(list('def'), dtype='category')
         )
         t = pa.Table.from_pandas(df, preserve_index=True)
-        raw_metadata = t.schema.metadata
-        js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
+        js = t.schema.pandas_metadata
 
         column_indexes, = js['column_indexes']
         assert column_indexes['name'] is None
@@ -281,8 +278,7 @@ def test_string_column_index(self):
             columns=pd.Index(list('def'), name='stringz')
         )
         t = pa.Table.from_pandas(df, preserve_index=True)
-        raw_metadata = t.schema.metadata
-        js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
+        js = t.schema.pandas_metadata
 
         column_indexes, = js['column_indexes']
         assert column_indexes['name'] == 'stringz'
@@ -308,8 +304,7 @@ def test_datetimetz_column_index(self):
             )
         )
         t = pa.Table.from_pandas(df, preserve_index=True)
-        raw_metadata = t.schema.metadata
-        js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
+        js = t.schema.pandas_metadata
 
         column_indexes, = js['column_indexes']
         assert column_indexes['name'] is None
@@ -399,10 +394,8 @@ def test_multiindex_duplicate_values(self):
     def test_metadata_with_mixed_types(self):
         df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
         table = pa.Table.from_pandas(df)
-        metadata = table.schema.metadata
-        assert b'mixed' not in metadata[b'pandas']
-
-        js = json.loads(metadata[b'pandas'].decode('utf8'))
+        js = table.schema.pandas_metadata
+        assert 'mixed' not in js
         data_column = js['columns'][0]
         assert data_column['pandas_type'] == 'bytes'
         assert data_column['numpy_type'] == 'object'
@@ -422,10 +415,8 @@ def test_list_metadata(self):
         df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
         schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
         table = pa.Table.from_pandas(df, schema=schema)
-        metadata = table.schema.metadata
-        assert b'mixed' not in metadata[b'pandas']
-
-        js = json.loads(metadata[b'pandas'].decode('utf8'))
+        js = table.schema.pandas_metadata
+        assert 'mixed' not in js
         data_column = js['columns'][0]
         assert data_column['pandas_type'] == 'list[int64]'
         assert data_column['numpy_type'] == 'object'
@@ -438,10 +429,8 @@ def test_decimal_metadata(self):
             ]
         })
         table = pa.Table.from_pandas(expected)
-        metadata = table.schema.metadata
-        assert b'mixed' not in metadata[b'pandas']
-
-        js = json.loads(metadata[b'pandas'].decode('utf8'))
+        js = table.schema.pandas_metadata
+        assert 'mixed' not in js
         data_column = js['columns'][0]
         assert data_column['pandas_type'] == 'decimal'
         assert data_column['numpy_type'] == 'object'
@@ -484,7 +473,7 @@ def test_empty_list_metadata(self):
         # type of empty lists
         df = tbl.to_pandas()
         tbl2 = pa.Table.from_pandas(df, preserve_index=True)
-        md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8'))
+        md2 = tbl2.schema.pandas_metadata
 
         # Second roundtrip
         df2 = tbl2.to_pandas()
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
@@ -153,12 +153,12 @@ def test_pandas_parquet_2_0_rountrip(tempdir, chunk_size):
 
     filename = tempdir / 'pandas_rountrip.parquet'
     arrow_table = pa.Table.from_pandas(df)
-    assert b'pandas' in arrow_table.schema.metadata
+    assert arrow_table.schema.pandas_metadata is not None
 
     _write_table(arrow_table, filename, version="2.0",
                  coerce_timestamps='ms', chunk_size=chunk_size)
     table_read = pq.read_pandas(filename)
-    assert b'pandas' in table_read.schema.metadata
+    assert table_read.schema.pandas_metadata is not None
 
     assert arrow_table.schema.metadata == table_read.schema.metadata
 
@@ -295,7 +295,7 @@ def test_pandas_parquet_column_multiindex(tempdir):
 
     filename = tempdir / 'pandas_rountrip.parquet'
     arrow_table = pa.Table.from_pandas(df)
-    assert b'pandas' in arrow_table.schema.metadata
+    assert arrow_table.schema.pandas_metadata is not None
 
     _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')
 
@@ -309,7 +309,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir):
 
     filename = tempdir / 'pandas_rountrip.parquet'
     arrow_table = pa.Table.from_pandas(df, preserve_index=False)
-    js = json.loads(arrow_table.schema.metadata[b'pandas'].decode('utf8'))
+    js = arrow_table.schema.pandas_metadata
     assert not js['index_columns']
     # ARROW-2170
     # While index_columns should be empty, columns needs to be filled still.
@@ -318,7 +318,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir):
     _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')
     table_read = pq.read_pandas(filename)
 
-    js = json.loads(table_read.schema.metadata[b'pandas'].decode('utf8'))
+    js = table_read.schema.pandas_metadata
     assert not js['index_columns']
 
     assert arrow_table.schema.metadata == table_read.schema.metadata
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
@@ -628,6 +628,19 @@ cdef class Schema:
     def __hash__(self):
         return hash((tuple(self), self.metadata))
 
+    @property
+    def pandas_metadata(self):
+        """
+        Return the b'pandas' metadata entry parsed from JSON, or None if absent
+        """
+        metadata = self.metadata
+        key = b'pandas'
+        if metadata is None or key not in metadata:
+            return None
+
+        import json
+        return json.loads(metadata[key].decode('utf8'))
+
     @property
     def names(self):
         """