Skip to content

Commit eef1d33

Browse files
jorisvandenbossche authored and wesm committed
alternative fix for duplicate index/column name that preserves index name if available
Change-Id: I68ca058b7d038a9f30d265aeaad192d0f86757cc
1 parent e327747 commit eef1d33

File tree

2 files changed

+34
-16
lines changed

2 files changed

+34
-16
lines changed

python/pyarrow/pandas_compat.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -179,10 +179,8 @@ def get_column_metadata(column, name, arrow_type, field_name):
179179
}
180180

181181

182-
index_level_name = '__index_level_{:d}__'.format
183-
184-
185-
def construct_metadata(df, column_names, index_levels, preserve_index, types):
182+
def construct_metadata(df, column_names, index_levels, index_column_names,
183+
preserve_index, types):
186184
"""Returns a dictionary containing enough metadata to reconstruct a pandas
187185
DataFrame as an Arrow Table, including index columns.
188186
@@ -197,9 +195,8 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
197195
-------
198196
dict
199197
"""
200-
ncolumns = len(column_names)
201-
df_types = types[:ncolumns - len(index_levels)]
202-
index_types = types[ncolumns - len(index_levels):]
198+
df_types = types[:-len(index_levels)]
199+
index_types = types[-len(index_levels):]
203200

204201
column_metadata = [
205202
get_column_metadata(
@@ -213,9 +210,6 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
213210
]
214211

215212
if preserve_index:
216-
index_column_names = list(map(
217-
index_level_name, range(len(index_levels))
218-
))
219213
index_column_metadata = [
220214
get_column_metadata(
221215
level,
@@ -294,9 +288,29 @@ def _column_name_to_strings(name):
294288
return str(name)
295289

296290

291+
def _index_level_name(index, i, column_names):
292+
"""Return the name of an index level or a default name if `index.name` is
293+
None or is already a column name.
294+
295+
Parameters
296+
----------
297+
index : pandas.Index
298+
i : int
299+
300+
Returns
301+
-------
302+
name : str
303+
"""
304+
if index.name is not None and index.name not in column_names:
305+
return index.name
306+
else:
307+
return '__index_level_{:d}__'.format(i)
308+
309+
297310
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
298-
names = []
311+
column_names = []
299312
index_columns = []
313+
index_column_names = []
300314
type = None
301315

302316
if preserve_index:
@@ -324,12 +338,13 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
324338

325339
columns_to_convert.append(col)
326340
convert_types.append(type)
327-
names.append(name)
341+
column_names.append(name)
328342

329343
for i, column in enumerate(index_columns):
330344
columns_to_convert.append(column)
331345
convert_types.append(None)
332-
names.append(index_level_name(i))
346+
name = _index_level_name(column, i, column_names)
347+
index_column_names.append(name)
333348

334349
# NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
335350
# using a thread pool is worth it. Currently the heuristic is whether the
@@ -358,8 +373,10 @@ def convert_column(col, ty):
358373
types = [x.type for x in arrays]
359374

360375
metadata = construct_metadata(
361-
df, names, index_columns, preserve_index, types
376+
df, column_names, index_columns, index_column_names, preserve_index,
377+
types
362378
)
379+
names = column_names + index_column_names
363380
return names, arrays, metadata
364381

365382

python/pyarrow/tests/test_convert_pandas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,9 @@ def test_index_metadata_field_name(self):
191191
assert idx0['field_name'] == idx0_name
192192
assert idx0['name'] is None
193193

194-
assert foo_name == '__index_level_1__'
195-
assert foo['name'] == 'foo'
194+
assert foo_name == 'foo'
195+
assert foo['field_name'] == foo_name
196+
assert foo['name'] == foo_name
196197

197198
def test_categorical_column_index(self):
198199
df = pd.DataFrame(

0 commit comments

Comments
 (0)