Commit 7e6a073 (1 parent: 54ab5be)

In gbq.to_gbq allow the DataFrame column order to differ from schema (pandas-dev#11359)

File tree: 4 files changed, +78 -9 lines changed

doc/source/io.rst (+1 -2)

@@ -4579,8 +4579,7 @@ a ``TableCreationError`` if the destination table already exists.
 
 If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
 be written to the table using the defined table schema and column types. The
-dataframe must match the destination table in column order, structure, and
-data types.
+dataframe must match the destination table in structure and data types.
 If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
 different schema, a delay of 2 minutes will be forced to ensure that the new schema
 has propagated in the Google environment. See
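
As a usage sketch of the documented ``'append'`` behavior: the call below appends a DataFrame whose columns are ordered differently from the destination table. The dataset, table, and project names are placeholders, and the columns are assumed to match an existing table schema in name and type:

import pandas as pd
from pandas.io import gbq

# Assume the destination table was created with columns A (FLOAT),
# B (FLOAT), C (STRING); the DataFrame lists them in a different order.
df = pd.DataFrame({'C': ['x', 'y'], 'A': [1.0, 2.0], 'B': [3.0, 4.0]})

# After this change the append succeeds: names and types must still
# match, but column order no longer has to.
gbq.to_gbq(df, 'my_dataset.my_table', 'my-project-id', if_exists='append')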

doc/source/whatsnew/v0.19.0.txt (+1)

@@ -397,6 +397,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci
 Google BigQuery Enhancements
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).
+The :func:`pandas.io.gbq.to_gbq` method now allows the DataFrame column order to differ from the destination table schema (:issue:`11359`).
 
 .. _whatsnew_0190.errstate:
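
For context on the ``dialect`` argument mentioned in the unchanged line above, a minimal call looks like the following; the query and project id are placeholders:

from pandas.io import gbq

# 'legacy' is the default dialect in this release; passing 'standard'
# opts into BigQuery standard SQL.
df = gbq.read_gbq('SELECT 1 AS x', project_id='my-project-id',
                  dialect='standard')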

pandas/io/gbq.py (+11 -7)

@@ -547,12 +547,17 @@ def verify_schema(self, dataset_id, table_id, schema):
         from apiclient.errors import HttpError
 
         try:
-            return (self.service.tables().get(
+            remote_schema = self.service.tables().get(
                 projectId=self.project_id,
                 datasetId=dataset_id,
-                tableId=table_id
-            ).execute()['schema']) == schema
+                tableId=table_id).execute()['schema']
 
+            fields_remote = set([json.dumps(field_remote)
+                                 for field_remote in remote_schema['fields']])
+            fields_local = set(json.dumps(field_local)
+                               for field_local in schema['fields'])
+
+            return fields_remote == fields_local
         except HttpError as ex:
             self.process_http_error(ex)
 
@@ -819,10 +824,9 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
                                                 dataset_id, table_id, table_schema)
         elif if_exists == 'append':
             if not connector.verify_schema(dataset_id, table_id, table_schema):
-                raise InvalidSchema("Please verify that the column order, "
-                                    "structure and data types in the "
-                                    "DataFrame match the schema of the "
-                                    "destination table.")
+                raise InvalidSchema("Please verify that the structure and "
+                                    "data types in the DataFrame match the "
+                                    "schema of the destination table.")
     else:
         table.create(table_id, table_schema)
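
The heart of the change: instead of comparing the remote schema dict to the local one with ``==`` (which is sensitive to the order of the ``fields`` list), each field definition is JSON-serialized and the results are compared as sets. A self-contained sketch of that idea with made-up schemas; note the commit itself does not pass ``sort_keys``, which is added here only to make each field's serialization canonical:

import json

def schemas_match(remote_schema, local_schema):
    # Serializing each field dict makes it hashable; comparing the
    # resulting sets ignores column order entirely.
    fields_remote = {json.dumps(field, sort_keys=True)
                     for field in remote_schema['fields']}
    fields_local = {json.dumps(field, sort_keys=True)
                    for field in local_schema['fields']}
    return fields_remote == fields_local

remote = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                     {'name': 'B', 'type': 'STRING'}]}
local = {'fields': [{'name': 'B', 'type': 'STRING'},
                    {'name': 'A', 'type': 'FLOAT'}]}

print(schemas_match(remote, local))   # True: same fields, new order
print(schemas_match(remote, {'fields': [{'name': 'A', 'type': 'STRING'}]}))
# False: mismatched type, and a field is missing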

pandas/io/tests/test_gbq.py (+65)

@@ -743,6 +743,8 @@ def setUp(self):
                                  private_key=_get_private_key_path())
         self.table = gbq._Table(_get_project_id(), DATASET_ID + "1",
                                 private_key=_get_private_key_path())
+        self.sut = gbq.GbqConnector(_get_project_id(),
+                                    private_key=_get_private_key_path())
 
     @classmethod
     def tearDownClass(cls):
@@ -906,6 +908,69 @@ def test_list_table(self):
                         'Expected table list to contain table {0}'
                         .format(destination_table))
 
+    def test_verify_schema_allows_flexible_column_order(self):
+        destination_table = TABLE_ID + "10"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertTrue(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected schema to match')
+
+    def test_verify_schema_fails_different_data_type(self):
+        destination_table = TABLE_ID + "11"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'STRING'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_verify_schema_fails_different_structure(self):
+        destination_table = TABLE_ID + "12"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B2', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_upload_data_flexible_column_order(self):
+        destination_table = DESTINATION_TABLE + "13"
+
+        test_size = 10
+        df = make_mixed_dataframe_v2(test_size)
+
+        # Initialize table with sample data
+        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
+                   private_key=_get_private_key_path())
+
+        df_columns_reversed = df[df.columns[::-1]]
+
+        gbq.to_gbq(df_columns_reversed, destination_table, _get_project_id(),
+                   if_exists='append', private_key=_get_private_key_path())
+
     def test_list_dataset(self):
         dataset_id = DATASET_ID + "1"
         self.assertTrue(dataset_id in self.dataset.datasets(),