
Commit 547812e

JohnPaton authored and max-sixty committed
ENH: Allow partial table schema in to_gbq() table_schema (#218) (#257)
* ENH: Allow partial table schema in to_gbq
* CLN: applied black
* BUG: make update_schema python 2.7 compatible
* DOC: update docs to allow for a subset of columns in to_gbq table_schema
* DOC: what's new
* DOC: close parens around issue in changelog
1 parent d06db4b commit 547812e

File tree

4 files changed: +97, -6 lines changed


docs/source/changelog.rst

Lines changed: 6 additions & 1 deletion

@@ -20,6 +20,11 @@ Internal changes
 - Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()``
   function. (:issue:`247`)
 
+Enhancements
+~~~~~~~~~~~~
+- Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
+  with the rest being populated using the DataFrame dtypes (:issue:`218`)
+  (contributed by @johnpaton)
 
 .. _changelog-0.9.0:
 
@@ -237,4 +242,4 @@ Initial release of transfered code from `pandas <https://github.com/pandas-dev/p
 Includes patches since the 0.19.2 release on pandas with the following:
 
 - :func:`read_gbq` now allows query configuration preferences `pandas-GH#14742 <https://github.com/pandas-dev/pandas/pull/14742>`__
-- :func:`read_gbq` now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer casted to ``int64`` which also caused precision loss `pandas-GH#14064 <https://github.com/pandas-dev/pandas/pull/14064>`__, and `pandas-GH#14305 <https://github.com/pandas-dev/pandas/pull/14305>`__
+- :func:`read_gbq` now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer casted to ``int64`` which also caused precision loss `pandas-GH#14064 <https://github.com/pandas-dev/pandas/pull/14064>`__, and `pandas-GH#14305 <https://github.com/pandas-dev/pandas/pull/14305>`__
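Not part of the diff, but for context: a minimal usage sketch of the enhancement described in the changelog entry above. The project, dataset, table, and column names are placeholders, not anything taken from this commit.

import pandas as pd
import pandas_gbq

df = pd.DataFrame(
    {
        "event_time": pd.to_datetime(["2019-01-01", "2019-01-02"]),
        "user_id": [1, 2],
        "notes": ["a", "b"],
    }
)

# Only the column that needs an explicit BigQuery type is listed;
# "user_id" and "notes" are filled in from the DataFrame dtypes.
pandas_gbq.to_gbq(
    df,
    destination_table="my_dataset.my_table",  # placeholder dataset.table
    project_id="my-project",  # placeholder project id
    table_schema=[{"name": "event_time", "type": "TIMESTAMP"}],
    if_exists="replace",
)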

pandas_gbq/gbq.py

Lines changed: 16 additions & 5 deletions

@@ -939,9 +939,11 @@ def to_gbq(
         'STRING'},...]``.
         If schema is not provided, it will be
         generated according to dtypes of DataFrame columns.
-        If schema is provided, it must contain all DataFrame columns.
-        pandas_gbq.gbq._generate_bq_schema() may be used to create an initial
-        schema, though it doesn't preserve column order.
+        If schema is provided, it may contain all or a subset of DataFrame
+        columns. If a subset is provided, the rest will be inferred from
+        the DataFrame dtypes.
+        pandas_gbq.gbq._generate_bq_schema() may be used to create an
+        initial schema, though it doesn't preserve column order.
         See BigQuery API documentation on available names of a field.
 
         .. versionadded:: 0.3.1

@@ -1023,10 +1025,13 @@ def to_gbq(
         credentials=connector.credentials,
     )
 
+    default_schema = _generate_bq_schema(dataframe)
     if not table_schema:
-        table_schema = _generate_bq_schema(dataframe)
+        table_schema = default_schema
     else:
-        table_schema = dict(fields=table_schema)
+        table_schema = _update_bq_schema(
+            default_schema, dict(fields=table_schema)
+        )
 
     # If table exists, check if_exists parameter
     if table.exists(table_id):

@@ -1091,6 +1096,12 @@ def _generate_bq_schema(df, default_type="STRING"):
     return schema.generate_bq_schema(df, default_type=default_type)
 
 
+def _update_bq_schema(schema_old, schema_new):
+    from pandas_gbq import schema
+
+    return schema.update_schema(schema_old, schema_new)
+
+
 class _Table(GbqConnector):
     def __init__(
        self,
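For illustration only (not code added by the commit), the new code path in to_gbq amounts to roughly the following sketch, reusing the private helpers shown in the hunks above; the DataFrame and the "event_time" override are assumed examples.

import pandas as pd

from pandas_gbq import gbq

df = pd.DataFrame({"event_time": pd.to_datetime(["2019-01-01"]), "value": [1.0]})

# Schema inferred from the DataFrame dtypes, one entry per column.
default_schema = gbq._generate_bq_schema(df)

# A user-supplied subset: only "event_time" is overridden.
user_schema = {"fields": [{"name": "event_time", "type": "TIMESTAMP"}]}

# Entries in the subset replace same-named defaults; every other column
# keeps its inferred type.
table_schema = gbq._update_bq_schema(default_schema, user_schema)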

pandas_gbq/schema.py

Lines changed: 29 additions & 0 deletions

@@ -31,3 +31,32 @@ def generate_bq_schema(dataframe, default_type="STRING"):
     )
 
     return {"fields": fields}
+
+
+def update_schema(schema_old, schema_new):
+    """
+    Given an old BigQuery schema, update it with a new one.
+
+    Where a field name is the same, the new will replace the old. Any
+    new fields not present in the old schema will be added.
+
+    Arguments:
+        schema_old: the old schema to update
+        schema_new: the new schema which will overwrite/extend the old
+    """
+    old_fields = schema_old["fields"]
+    new_fields = schema_new["fields"]
+    output_fields = list(old_fields)
+
+    field_indices = {field["name"]: i for i, field in enumerate(output_fields)}
+
+    for field in new_fields:
+        name = field["name"]
+        if name in field_indices:
+            # replace old field with new field of same name
+            output_fields[field_indices[name]] = field
+        else:
+            # add new field
+            output_fields.append(field)
+
+    return {"fields": output_fields}
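To make the merge semantics concrete, a small worked example of update_schema as implemented above (the field names and types are made up). Because the old, inferred fields seed the output list, column order is preserved and only the overridden types change.

from pandas_gbq.schema import update_schema

old = {
    "fields": [
        {"name": "col1", "type": "INTEGER"},
        {"name": "col2", "type": "STRING"},
    ]
}
new = {
    "fields": [
        {"name": "col2", "type": "TIMESTAMP"},  # same name: replaces the old entry
        {"name": "col3", "type": "FLOAT"},  # not in the old schema: appended
    ]
}

update_schema(old, new)
# {'fields': [{'name': 'col1', 'type': 'INTEGER'},
#             {'name': 'col2', 'type': 'TIMESTAMP'},
#             {'name': 'col3', 'type': 'FLOAT'}]}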

tests/unit/test_schema.py

Lines changed: 46 additions & 0 deletions

@@ -54,3 +54,49 @@
 def test_generate_bq_schema(dataframe, expected_schema):
     schema = pandas_gbq.schema.generate_bq_schema(dataframe)
     assert schema == expected_schema
+
+
+@pytest.mark.parametrize(
+    "schema_old,schema_new,expected_output",
+    [
+        (
+            {"fields": [{"name": "col1", "type": "INTEGER"}]},
+            {"fields": [{"name": "col2", "type": "TIMESTAMP"}]},
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "TIMESTAMP"},
+                ]
+            },
+        ),
+        (
+            {"fields": [{"name": "col1", "type": "INTEGER"}]},
+            {"fields": [{"name": "col1", "type": "BOOLEAN"}]},
+            {"fields": [{"name": "col1", "type": "BOOLEAN"}]},
+        ),
+        (
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "INTEGER"},
+                ]
+            },
+            {
+                "fields": [
+                    {"name": "col2", "type": "BOOLEAN"},
+                    {"name": "col3", "type": "FLOAT"},
+                ]
+            },
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "BOOLEAN"},
+                    {"name": "col3", "type": "FLOAT"},
+                ]
+            },
+        ),
+    ],
+)
+def test_update_schema(schema_old, schema_new, expected_output):
+    output = pandas_gbq.schema.update_schema(schema_old, schema_new)
+    assert output == expected_output
