@@ -173,6 +173,7 @@ def _upsert(
     temp_table: str,
     schema: str,
     primary_keys: Optional[List[str]] = None,
+    precombine_key: Optional[str] = None,
 ) -> None:
     if not primary_keys:
         primary_keys = _get_primary_keys(cursor=cursor, schema=schema, table=table)
@@ -181,12 +182,26 @@ def _upsert(
         raise exceptions.InvalidRedshiftPrimaryKeys()
     equals_clause: str = f"{table}.%s = {temp_table}.%s"
     join_clause: str = " AND ".join([equals_clause % (pk, pk) for pk in primary_keys])
-    sql: str = f'DELETE FROM "{schema}"."{table}" USING {temp_table} WHERE {join_clause}'
-    _logger.debug(sql)
-    cursor.execute(sql)
-    sql = f"INSERT INTO {schema}.{table} SELECT * FROM {temp_table}"
-    _logger.debug(sql)
-    cursor.execute(sql)
+    if precombine_key:
+        delete_from_target_filter: str = f"AND {table}.{precombine_key} <= {temp_table}.{precombine_key}"
+        delete_from_temp_filter: str = f"AND {table}.{precombine_key} > {temp_table}.{precombine_key}"
+        target_del_sql: str = (
+            f'DELETE FROM "{schema}"."{table}" USING {temp_table} WHERE {join_clause} {delete_from_target_filter}'
+        )
+        _logger.debug(target_del_sql)
+        cursor.execute(target_del_sql)
+        source_del_sql: str = (
+            f'DELETE FROM {temp_table} USING "{schema}"."{table}" WHERE {join_clause} {delete_from_temp_filter}'
+        )
+        _logger.debug(source_del_sql)
+        cursor.execute(source_del_sql)
+    else:
+        sql: str = f'DELETE FROM "{schema}"."{table}" USING {temp_table} WHERE {join_clause}'
+        _logger.debug(sql)
+        cursor.execute(sql)
+    insert_sql = f"INSERT INTO {schema}.{table} SELECT * FROM {temp_table}"
+    _logger.debug(insert_sql)
+    cursor.execute(insert_sql)

     _drop_table(cursor=cursor, schema=schema, table=temp_table)

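Note: the two added DELETE statements implement the precombine rule. On a primary-key match, the row with the larger precombine column survives: target rows that are not newer than their staged counterparts are deleted from the target, then staged rows that lost to a newer target row are deleted from the staging table, so the final INSERT copies only winners (ties go to the staged row). A minimal sketch of the SQL this generates, using hypothetical names not taken from the diff (table `events` in schema `public`, staging table `events_temp`, primary key `id`, precombine_key `updated_at`):

```python
# Illustrative sketch only (hypothetical table/column names).
schema, table, temp_table = "public", "events", "events_temp"
join_clause = f"{table}.id = {temp_table}.id"

# 1) Drop target rows that are not newer than their staged counterpart.
target_del_sql = (
    f'DELETE FROM "{schema}"."{table}" USING {temp_table} '
    f"WHERE {join_clause} AND {table}.updated_at <= {temp_table}.updated_at"
)

# 2) Drop staged rows that lost to a newer row already in the target.
source_del_sql = (
    f'DELETE FROM {temp_table} USING "{schema}"."{table}" '
    f"WHERE {join_clause} AND {table}.updated_at > {temp_table}.updated_at"
)

# 3) Whatever survived in the staging table wins the upsert.
insert_sql = f"INSERT INTO {schema}.{table} SELECT * FROM {temp_table}"
```
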
@@ -424,29 +439,29 @@ def connect(
     ----------
     connection : Optional[str]
         Glue Catalog Connection name.
-    secret_id: Optional[str]:
+    secret_id : Optional[str]:
         Specifies the secret containing the connection details that you want to retrieve.
         You can specify either the Amazon Resource Name (ARN) or the friendly name of the secret.
     catalog_id : str, optional
         The ID of the Data Catalog.
         If none is provided, the AWS account ID is used by default.
-    dbname: Optional[str]
+    dbname : Optional[str]
         Optional database name to overwrite the stored one.
     boto3_session : boto3.Session(), optional
         Boto3 Session. The default boto3 session will be used if boto3_session receive None.
-    ssl: bool
+    ssl : bool
         This governs SSL encryption for TCP/IP sockets.
         This parameter is forward to redshift_connector.
         https://github.com/aws/amazon-redshift-python-driver
-    timeout: Optional[int]
+    timeout : Optional[int]
         This is the time in seconds before the connection to the server will time out.
         The default is None which means no timeout.
         This parameter is forward to redshift_connector.
         https://github.com/aws/amazon-redshift-python-driver
-    max_prepared_statements: int
+    max_prepared_statements : int
         This parameter is forward to redshift_connector.
         https://github.com/aws/amazon-redshift-python-driver
-    tcp_keepalive: bool
+    tcp_keepalive : bool
         If True then use TCP keepalive. The default is True.
         This parameter is forward to redshift_connector.
         https://github.com/aws/amazon-redshift-python-driver
@@ -534,19 +549,19 @@ def connect_temp(
         in addition to any group memberships for an existing user. If not specified, a new user is added only to PUBLIC.
     boto3_session : boto3.Session(), optional
         Boto3 Session. The default boto3 session will be used if boto3_session receive None.
-    ssl: bool
+    ssl : bool
         This governs SSL encryption for TCP/IP sockets.
         This parameter is forward to redshift_connector.
         https://github.com/aws/amazon-redshift-python-driver
-    timeout: Optional[int]
+    timeout : Optional[int]
         This is the time in seconds before the connection to the server will time out.
         The default is None which means no timeout.
         This parameter is forward to redshift_connector.
         https://github.com/aws/amazon-redshift-python-driver
-    max_prepared_statements: int
+    max_prepared_statements : int
         This parameter is forward to redshift_connector.
         https://github.com/aws/amazon-redshift-python-driver
-    tcp_keepalive: bool
+    tcp_keepalive : bool
         If True then use TCP keepalive. The default is True.
         This parameter is forward to redshift_connector.
         https://github.com/aws/amazon-redshift-python-driver
@@ -697,7 +712,7 @@ def read_sql_table(
         List of parameters to pass to execute method.
         The syntax used to pass parameters is database driver dependent.
         Check your database driver documentation for which of the five syntax styles,
-        described in PEP 249’s paramstyle, is supported.
+        described in PEP 249's paramstyle, is supported.
     chunksize : int, optional
         If specified, return an iterator where chunksize is the number of rows to include in each chunk.
     dtype : Dict[str, pyarrow.DataType], optional
@@ -761,6 +776,7 @@ def to_sql(  # pylint: disable=too-many-locals
     lock: bool = False,
     chunksize: int = 200,
     commit_transaction: bool = True,
+    precombine_key: Optional[str] = None,
 ) -> None:
     """Write records stored in a DataFrame into Redshift.

@@ -793,7 +809,7 @@ def to_sql(  # pylint: disable=too-many-locals
     index : bool
         True to store the DataFrame index as a column in the table,
         otherwise False to ignore it.
-    dtype: Dict[str, str], optional
+    dtype : Dict[str, str], optional
         Dictionary of columns names and Redshift types to be casted.
         Useful when you have columns with undetermined or mixed data types.
         (e.g. {'col name': 'VARCHAR(10)', 'col2 name': 'FLOAT'})
@@ -819,10 +835,14 @@ def to_sql(  # pylint: disable=too-many-locals
         inserted into the database columns `col1` and `col3`.
     lock : bool
         True to execute LOCK command inside the transaction to force serializable isolation.
-    chunksize: int
+    chunksize : int
         Number of rows which are inserted with each SQL query. Defaults to inserting 200 rows per query.
-    commit_transaction: bool
+    commit_transaction : bool
         Whether to commit the transaction. True by default.
+    precombine_key : str, optional
+        When there is a primary_key match during upsert, this column will change the upsert method,
+        comparing the values of the specified column from source and target, and keeping the
+        larger of the two. Will only work when mode = upsert.

     Returns
     -------
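With the new parameter wired through, an upsert can resolve primary-key collisions by comparing a version-like column instead of unconditionally replacing target rows. A minimal usage sketch, assuming a hypothetical Glue connection name, target table, and timestamp-like column `ts` (none of these names come from the diff):

```python
import awswrangler as wr
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "ts": [100, 200], "val": ["foo", "boo"]})

con = wr.redshift.connect("aws-data-wrangler-redshift")  # hypothetical connection name
wr.redshift.to_sql(
    df=df,
    con=con,
    schema="public",
    table="my_table",     # hypothetical target table; needs a primary key for upsert
    mode="upsert",
    precombine_key="ts",  # on a key collision, the row with the larger ts wins
)
con.close()
```
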
@@ -887,7 +907,14 @@ def to_sql(  # pylint: disable=too-many-locals
             if table != created_table:  # upsert
                 if lock:
                     _lock(cursor, [table], schema=schema)
-                _upsert(cursor=cursor, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
+                _upsert(
+                    cursor=cursor,
+                    schema=schema,
+                    table=table,
+                    temp_table=created_table,
+                    primary_keys=primary_keys,
+                    precombine_key=precombine_key,
+                )
             if commit_transaction:
                 con.commit()
     except Exception as ex:
@@ -1071,7 +1098,7 @@ def unload(

     Parameters
     ----------
-    sql: str
+    sql : str
         SQL query.
     path : Union[str, List[str]]
         S3 path to write stage files (e.g. s3://bucket_name/any_name/)
@@ -1114,7 +1141,7 @@ def unload(
         If integer is provided, specified number is used.
     boto3_session : boto3.Session(), optional
         Boto3 Session. The default boto3 session will be used if boto3_session receive None.
-    s3_additional_kwargs:
+    s3_additional_kwargs : Dict[str, str], optional
         Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.

     Returns
@@ -1206,6 +1233,7 @@ def copy_from_files(  # pylint: disable=too-many-locals,too-many-arguments
     sql_copy_extra_params: Optional[List[str]] = None,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
+    precombine_key: Optional[str] = None,
 ) -> None:
     """Load Parquet files from S3 to a Table on Amazon Redshift (Through COPY command).

@@ -1277,12 +1305,12 @@ def copy_from_files(  # pylint: disable=too-many-locals,too-many-arguments
         Should Wrangler add SERIALIZETOJSON parameter into the COPY command?
         SERIALIZETOJSON is necessary to load nested data
         https://docs.aws.amazon.com/redshift/latest/dg/ingest-super.html#copy_json
-    path_suffix: Union[str, List[str], None]
+    path_suffix : Union[str, List[str], None]
         Suffix or List of suffixes to be scanned on s3 for the schema extraction
         (e.g. [".gz.parquet", ".snappy.parquet"]).
         Only has effect during the table creation.
         If None, will try to read all files. (default)
-    path_ignore_suffix: Union[str, List[str], None]
+    path_ignore_suffix : Union[str, List[str], None]
         Suffix or List of suffixes for S3 keys to be ignored during the schema extraction.
         (e.g. [".csv", "_SUCCESS"]).
         Only has effect during the table creation.
@@ -1293,17 +1321,21 @@ def copy_from_files(  # pylint: disable=too-many-locals,too-many-arguments
         If integer is provided, specified number is used.
     lock : bool
         True to execute LOCK command inside the transaction to force serializable isolation.
-    commit_transaction: bool
+    commit_transaction : bool
         Whether to commit the transaction. True by default.
-    manifest: bool
+    manifest : bool
         If set to true path argument accepts a S3 uri to a manifest file.
-    sql_copy_extra_params: Optional[List[str]]
+    sql_copy_extra_params : Optional[List[str]]
         Additional copy parameters to pass to the command. For example: ["STATUPDATE ON"]
     boto3_session : boto3.Session(), optional
         Boto3 Session. The default boto3 session will be used if boto3_session receive None.
-    s3_additional_kwargs:
+    s3_additional_kwargs : Dict[str, str], optional
         Forwarded to botocore requests.
         e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
+    precombine_key : str, optional
+        When there is a primary_key match during upsert, this column will change the upsert method,
+        comparing the values of the specified column from source and target, and keeping the
+        larger of the two. Will only work when mode = upsert.

     Returns
     -------
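The same knob applies to the COPY-based load path. A sketch mirroring the to_sql example above, again with hypothetical S3 path, IAM role, and table names:

```python
import awswrangler as wr

con = wr.redshift.connect("aws-data-wrangler-redshift")  # hypothetical connection name
wr.redshift.copy_from_files(
    path="s3://my-bucket/stage/",  # hypothetical prefix holding Parquet files
    con=con,
    schema="public",
    table="my_table",
    iam_role="arn:aws:iam::111111111111:role/RedshiftCopyRole",  # hypothetical role
    mode="upsert",
    precombine_key="ts",  # keep the row with the larger ts on key collisions
)
con.close()
```
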
@@ -1374,7 +1406,14 @@ def copy_from_files(  # pylint: disable=too-many-locals,too-many-arguments
             if table != created_table:  # upsert
                 if lock:
                     _lock(cursor, [table], schema=schema)
-                _upsert(cursor=cursor, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
+                _upsert(
+                    cursor=cursor,
+                    schema=schema,
+                    table=table,
+                    temp_table=created_table,
+                    primary_keys=primary_keys,
+                    precombine_key=precombine_key,
+                )
             if commit_transaction:
                 con.commit()
     except Exception as ex:
@@ -1440,7 +1479,7 @@ def copy(  # pylint: disable=too-many-arguments

     Parameters
     ----------
-    df: pandas.DataFrame
+    df : pandas.DataFrame
         Pandas DataFrame.
     path : str
         S3 path to write stage files (e.g. s3://bucket_name/any_name/).
@@ -1462,12 +1501,12 @@ def copy(  # pylint: disable=too-many-arguments
         The session key for your AWS account. This is only needed when you are using temporary credentials.
     index : bool
         True to store the DataFrame index in file, otherwise False to ignore it.
-    dtype: Dict[str, str], optional
+    dtype : Dict[str, str], optional
         Dictionary of columns names and Athena/Glue types to be casted.
         Useful when you have columns with undetermined or mixed data types.
         Only takes effect if dataset=True.
         (e.g. {'col name': 'bigint', 'col2 name': 'int'})
-    mode: str
+    mode : str
         Append, overwrite or upsert.
     overwrite_method : str
         Drop, cascade, truncate, or delete. Only applicable in overwrite mode.
@@ -1477,7 +1516,7 @@ def copy(  # pylint: disable=too-many-arguments
         "truncate" - ``TRUNCATE ...`` - truncates the table, but immediatly commits current
         transaction & starts a new one, hence the overwrite happens in two transactions and is not atomic.
         "delete" - ``DELETE FROM ...`` - deletes all rows from the table. Slow relative to the other methods.
-    diststyle: str
+    diststyle : str
         Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
         https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
     distkey : str, optional
@@ -1501,11 +1540,11 @@ def copy(  # pylint: disable=too-many-arguments
         If integer is provided, specified number is used.
     lock : bool
         True to execute LOCK command inside the transaction to force serializable isolation.
-    sql_copy_extra_params: Optional[List[str]]
+    sql_copy_extra_params : Optional[List[str]]
         Additional copy parameters to pass to the command. For example: ["STATUPDATE ON"]
     boto3_session : boto3.Session(), optional
         Boto3 Session. The default boto3 session will be used if boto3_session receive None.
-    s3_additional_kwargs:
+    s3_additional_kwargs : Dict[str, str], optional
         Forwarded to botocore requests.
         e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
     max_rows_by_file : int