Skip to content

Commit ec929ae

Browse files
committed
Add pandas_metadata attribute to pyarrow.Schema to make interactions simpler
1 parent 670dc6f commit ec929ae

File tree

4 files changed

+31
-32
lines changed

4 files changed

+31
-32
lines changed

python/pyarrow/pandas_compat.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -603,12 +603,9 @@ def table_to_blockmanager(options, table, categories=None,
603603
ignore_metadata=False):
604604
all_columns = []
605605
column_indexes = []
606-
metadata = table.schema.metadata
607-
has_pandas_metadata = (not ignore_metadata and metadata is not None
608-
and b'pandas' in metadata)
606+
pandas_metadata = table.schema.pandas_metadata
609607

610-
if has_pandas_metadata:
611-
pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
608+
if not ignore_metadata and pandas_metadata is not None:
612609
all_columns = pandas_metadata['columns']
613610
column_indexes = pandas_metadata.get('column_indexes', [])
614611
index_descriptors = pandas_metadata['index_columns']

python/pyarrow/tests/test_convert_pandas.py

Lines changed: 11 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -230,9 +230,7 @@ def test_index_metadata_field_name(self):
230230
columns=['a', None, '__index_level_0__'],
231231
)
232232
t = pa.Table.from_pandas(df, preserve_index=True)
233-
raw_metadata = t.schema.metadata
234-
235-
js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
233+
js = t.schema.pandas_metadata
236234

237235
col1, col2, col3, idx0, foo = js['columns']
238236

@@ -263,8 +261,7 @@ def test_categorical_column_index(self):
263261
columns=pd.Index(list('def'), dtype='category')
264262
)
265263
t = pa.Table.from_pandas(df, preserve_index=True)
266-
raw_metadata = t.schema.metadata
267-
js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
264+
js = t.schema.pandas_metadata
268265

269266
column_indexes, = js['column_indexes']
270267
assert column_indexes['name'] is None
@@ -281,8 +278,7 @@ def test_string_column_index(self):
281278
columns=pd.Index(list('def'), name='stringz')
282279
)
283280
t = pa.Table.from_pandas(df, preserve_index=True)
284-
raw_metadata = t.schema.metadata
285-
js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
281+
js = t.schema.pandas_metadata
286282

287283
column_indexes, = js['column_indexes']
288284
assert column_indexes['name'] == 'stringz'
@@ -308,8 +304,7 @@ def test_datetimetz_column_index(self):
308304
)
309305
)
310306
t = pa.Table.from_pandas(df, preserve_index=True)
311-
raw_metadata = t.schema.metadata
312-
js = json.loads(raw_metadata[b'pandas'].decode('utf8'))
307+
js = t.schema.pandas_metadata
313308

314309
column_indexes, = js['column_indexes']
315310
assert column_indexes['name'] is None
@@ -399,10 +394,8 @@ def test_multiindex_duplicate_values(self):
399394
def test_metadata_with_mixed_types(self):
400395
df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
401396
table = pa.Table.from_pandas(df)
402-
metadata = table.schema.metadata
403-
assert b'mixed' not in metadata[b'pandas']
404-
405-
js = json.loads(metadata[b'pandas'].decode('utf8'))
397+
js = table.schema.pandas_metadata
398+
assert 'mixed' not in js
406399
data_column = js['columns'][0]
407400
assert data_column['pandas_type'] == 'bytes'
408401
assert data_column['numpy_type'] == 'object'
@@ -422,10 +415,8 @@ def test_list_metadata(self):
422415
df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
423416
schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
424417
table = pa.Table.from_pandas(df, schema=schema)
425-
metadata = table.schema.metadata
426-
assert b'mixed' not in metadata[b'pandas']
427-
428-
js = json.loads(metadata[b'pandas'].decode('utf8'))
418+
js = table.schema.pandas_metadata
419+
assert 'mixed' not in js
429420
data_column = js['columns'][0]
430421
assert data_column['pandas_type'] == 'list[int64]'
431422
assert data_column['numpy_type'] == 'object'
@@ -438,10 +429,8 @@ def test_decimal_metadata(self):
438429
]
439430
})
440431
table = pa.Table.from_pandas(expected)
441-
metadata = table.schema.metadata
442-
assert b'mixed' not in metadata[b'pandas']
443-
444-
js = json.loads(metadata[b'pandas'].decode('utf8'))
432+
js = table.schema.pandas_metadata
433+
assert 'mixed' not in js
445434
data_column = js['columns'][0]
446435
assert data_column['pandas_type'] == 'decimal'
447436
assert data_column['numpy_type'] == 'object'
@@ -484,7 +473,7 @@ def test_empty_list_metadata(self):
484473
# type of empty lists
485474
df = tbl.to_pandas()
486475
tbl2 = pa.Table.from_pandas(df, preserve_index=True)
487-
md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8'))
476+
md2 = tbl2.schema.pandas_metadata
488477

489478
# Second roundtrip
490479
df2 = tbl2.to_pandas()

python/pyarrow/tests/test_parquet.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -153,12 +153,12 @@ def test_pandas_parquet_2_0_rountrip(tempdir, chunk_size):
153153

154154
filename = tempdir / 'pandas_rountrip.parquet'
155155
arrow_table = pa.Table.from_pandas(df)
156-
assert b'pandas' in arrow_table.schema.metadata
156+
assert arrow_table.schema.pandas_metadata is not None
157157

158158
_write_table(arrow_table, filename, version="2.0",
159159
coerce_timestamps='ms', chunk_size=chunk_size)
160160
table_read = pq.read_pandas(filename)
161-
assert b'pandas' in table_read.schema.metadata
161+
assert table_read.schema.pandas_metadata is not None
162162

163163
assert arrow_table.schema.metadata == table_read.schema.metadata
164164

@@ -295,7 +295,7 @@ def test_pandas_parquet_column_multiindex(tempdir):
295295

296296
filename = tempdir / 'pandas_rountrip.parquet'
297297
arrow_table = pa.Table.from_pandas(df)
298-
assert b'pandas' in arrow_table.schema.metadata
298+
assert arrow_table.schema.pandas_metadata is not None
299299

300300
_write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')
301301

@@ -309,7 +309,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir):
309309

310310
filename = tempdir / 'pandas_rountrip.parquet'
311311
arrow_table = pa.Table.from_pandas(df, preserve_index=False)
312-
js = json.loads(arrow_table.schema.metadata[b'pandas'].decode('utf8'))
312+
js = arrow_table.schema.pandas_metadata
313313
assert not js['index_columns']
314314
# ARROW-2170
315315
# While index_columns should be empty, columns needs to be filled still.
@@ -318,7 +318,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir):
318318
_write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')
319319
table_read = pq.read_pandas(filename)
320320

321-
js = json.loads(table_read.schema.metadata[b'pandas'].decode('utf8'))
321+
js = table_read.schema.pandas_metadata
322322
assert not js['index_columns']
323323

324324
assert arrow_table.schema.metadata == table_read.schema.metadata

python/pyarrow/types.pxi

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -628,6 +628,19 @@ cdef class Schema:
628628
def __hash__(self):
629629
return hash((tuple(self), self.metadata))
630630

631+
@property
632+
def pandas_metadata(self):
633+
"""
634+
Return deserialized-from-JSON pandas metadata field (if it exists)
635+
"""
636+
metadata = self.metadata
637+
key = b'pandas'
638+
if metadata is None or key not in metadata:
639+
return None
640+
641+
import json
642+
return json.loads(metadata[key].decode('utf8'))
643+
631644
@property
632645
def names(self):
633646
"""

0 commit comments

Comments
 (0)