Skip to content

Commit 670dc6f

Browse files
committed
Add compatibility tests for pre-0.13 metadata. Add Arrow version to pandas metadata
1 parent 0ca1bfc commit 670dc6f

File tree

2 files changed

+177
-0
lines changed

2 files changed

+177
-0
lines changed

python/pyarrow/pandas_compat.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,10 @@ def construct_metadata(df, column_names, index_levels, index_descriptors,
245245
'index_columns': index_descriptors,
246246
'column_indexes': column_indexes,
247247
'columns': column_metadata + index_column_metadata,
248+
'creator': {
249+
'library': 'pyarrow',
250+
'version': pa.__version__
251+
},
248252
'pandas_version': pd.__version__
249253
}).encode('utf8')
250254
}

python/pyarrow/tests/test_convert_pandas.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2539,3 +2539,176 @@ def test_table_from_pandas_columns_and_schema_are_mutually_exclusive():
25392539

25402540
with pytest.raises(ValueError):
25412541
pa.Table.from_pandas(df, schema=schema, columns=columns)
2542+
2543+
2544+
# ----------------------------------------------------------------------
2545+
# Legacy metadata compatibility tests
2546+
2547+
2548+
def test_range_index_pre_0_12():
    """Check that legacy RangeIndex metadata still converts to pandas.

    Covers metadata created from ``pandas.RangeIndex`` prior to pyarrow
    0.13.0.  Every case hand-builds a table in which the index values are
    stored as a regular int64 column and ``index_columns`` in the
    ``b'pandas'`` schema metadata is a plain list of field names, then
    asserts that ``Table.to_pandas`` returns the expected DataFrame.
    """
    a_values = [u'foo', u'bar', None, u'baz']
    b_values = [u'a', u'a', u'b', u'b']
    a_arrow = pa.array(a_values, type='utf8')
    b_arrow = pa.array(b_values, type='utf8')

    rng_index_arrow = pa.array([0, 2, 4, 6], type='int64')

    gen_name_0 = '__index_level_0__'
    gen_name_1 = '__index_level_1__'

    def column_meta(name, field_name, pandas_type, numpy_type):
        # One entry of the 'columns' list in the legacy metadata.
        return {'name': name,
                'field_name': field_name,
                'pandas_type': pandas_type,
                'numpy_type': numpy_type,
                'metadata': None}

    def with_legacy_metadata(table, data_name, index_metas):
        # Attach hand-written 'pandas' metadata to ``table``: a single
        # string data column called ``data_name`` followed by the given
        # index column entries.  'index_columns' lists the index field
        # names in the same order as ``index_metas``.
        metadata = {
            'index_columns': [meta['field_name'] for meta in index_metas],
            'column_indexes': [{'name': None,
                                'field_name': None,
                                'pandas_type': 'unicode',
                                'numpy_type': 'object',
                                'metadata': {'encoding': 'UTF-8'}}],
            'columns': ([column_meta(data_name, data_name,
                                     'unicode', 'object')] +
                        list(index_metas)),
            'pandas_version': '0.23.4',
        }
        return table.replace_schema_metadata(
            {b'pandas': json.dumps(metadata)})

    # Case 1: named RangeIndex
    e1 = pd.DataFrame({
        'a': a_values
    }, index=pd.RangeIndex(0, 8, step=2, name='qux'))
    t1 = pa.Table.from_arrays([a_arrow, rng_index_arrow],
                              names=['a', 'qux'])
    t1 = with_legacy_metadata(
        t1, 'a', [column_meta('qux', 'qux', 'int64', 'int64')])
    r1 = t1.to_pandas()
    tm.assert_frame_equal(r1, e1)

    # Case 2: named RangeIndex, but conflicts with an actual column.
    # The index is therefore stored under a generated field name while
    # its metadata keeps the logical name 'qux'; the data column entry
    # describes the real 'qux' field of the table.
    e2 = pd.DataFrame({
        'qux': a_values
    }, index=pd.RangeIndex(0, 8, step=2, name='qux'))
    t2 = pa.Table.from_arrays([a_arrow, rng_index_arrow],
                              names=['qux', gen_name_0])
    t2 = with_legacy_metadata(
        t2, 'qux', [column_meta('qux', gen_name_0, 'int64', 'int64')])
    r2 = t2.to_pandas()
    tm.assert_frame_equal(r2, e2)

    # Case 3: unnamed RangeIndex
    e3 = pd.DataFrame({
        'a': a_values
    }, index=pd.RangeIndex(0, 8, step=2, name=None))
    t3 = pa.Table.from_arrays([a_arrow, rng_index_arrow],
                              names=['a', gen_name_0])
    t3 = with_legacy_metadata(
        t3, 'a', [column_meta(None, gen_name_0, 'int64', 'int64')])
    r3 = t3.to_pandas()
    tm.assert_frame_equal(r3, e3)

    # Case 4: MultiIndex with named RangeIndex
    e4 = pd.DataFrame({
        'a': a_values
    }, index=[pd.RangeIndex(0, 8, step=2, name='qux'), b_values])
    t4 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow],
                              names=['a', 'qux', gen_name_1])
    t4 = with_legacy_metadata(
        t4, 'a', [column_meta('qux', 'qux', 'int64', 'int64'),
                  column_meta(None, gen_name_1, 'unicode', 'object')])
    r4 = t4.to_pandas()
    tm.assert_frame_equal(r4, e4)

    # Case 5: MultiIndex with unnamed RangeIndex
    e5 = pd.DataFrame({
        'a': a_values
    }, index=[pd.RangeIndex(0, 8, step=2, name=None), b_values])
    t5 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow],
                              names=['a', gen_name_0, gen_name_1])
    t5 = with_legacy_metadata(
        t5, 'a', [column_meta(None, gen_name_0, 'int64', 'int64'),
                  column_meta(None, gen_name_1, 'unicode', 'object')])
    r5 = t5.to_pandas()
    tm.assert_frame_equal(r5, e5)

0 commit comments

Comments
 (0)