@@ -97,8 +97,6 @@ bedrock.S3DataSource(self, 'DataSource',
97
97
knowledge_base = kb,
98
98
data_source_name = ' books' ,
99
99
chunking_strategy = bedrock.ChunkingStrategy.FIXED_SIZE ,
100
- max_tokens = 500 ,
101
- overlap_percentage = 20
102
100
)
103
101
104
102
```
@@ -226,8 +224,6 @@ new bedrock.S3DataSource(this, "DataSource", {
226
224
knowledgeBase: kb ,
227
225
dataSourceName: " books" ,
228
226
chunkingStrategy: bedrock .ChunkingStrategy .FIXED_SIZE ,
229
- maxTokens: 500 ,
230
- overlapPercentage: 20 ,
231
227
});
232
228
```
233
229
@@ -287,9 +283,7 @@ bedrock.S3DataSource(self, 'DataSource',
287
283
bucket = docBucket,
288
284
knowledge_base = kb,
289
285
data_source_name = ' books' ,
290
- chunking_strategy = bedrock.ChunkingStrategy.FIXED_SIZE ,
291
- max_tokens = 500 ,
292
- overlap_percentage = 20
286
+ chunking_strategy = bedrock.ChunkingStrategy.FIXED_SIZE ,
293
287
)
294
288
```
295
289
@@ -323,8 +317,6 @@ new bedrock.S3DataSource(this, "DataSource", {
323
317
knowledgeBase: kb ,
324
318
dataSourceName: " books" ,
325
319
chunkingStrategy: bedrock .ChunkingStrategy .FIXED_SIZE ,
326
- maxTokens: 500 ,
327
- overlapPercentage: 20 ,
328
320
});
329
321
```
330
322
@@ -361,8 +353,6 @@ bedrock.S3DataSource(self, 'DataSource',
361
353
knowledge_base = kb,
362
354
data_source_name = ' books' ,
363
355
chunking_strategy = bedrock.ChunkingStrategy.FIXED_SIZE ,
364
- max_tokens = 500 ,
365
- overlap_percentage = 20
366
356
)
367
357
```
368
358
@@ -482,19 +472,139 @@ kb.addSharePointDataSource({
482
472
});
483
473
```
484
474
475
+ Python
476
+
477
+ ``` python
478
+ from aws_cdk import (
479
+ Stack,
480
+ aws_s3 as s3,
481
+ aws_lambda as _lambda,
482
+ aws_secretsmanager as secretsmanager,
483
+ aws_kms as kms
484
+ )
485
+ from constructs import Construct
486
+ from cdklabs.generative_ai_cdk_constructs import (
487
+ bedrock
488
+ )
489
+
490
+ class PythonTestStack (Stack ):
491
+
492
+ def __init__ (self , scope : Construct, construct_id : str , ** kwargs ) -> None :
493
+ super ().__init__ (scope, construct_id, ** kwargs)
494
+
495
+ kb = bedrock.KnowledgeBase(self , ' MyKnowledgeBase' ,
496
+ embeddings_model = bedrock.BedrockFoundationModel.COHERE_EMBED_MULTILINGUAL_V3 ,
497
+ )
498
+
499
+ docBucket = s3.Bucket(self , ' Bucket' )
500
+
501
+ function = _lambda.Function(self , ' MyFunction' ,
502
+ runtime = _lambda.Runtime.PYTHON_3_12 ,
503
+ handler = ' index.handler' ,
504
+ code = _lambda.Code.from_inline(' print("Hello, World!")' ),
505
+ )
506
+
507
+ kb.add_web_crawler_data_source(
508
+ source_urls = [' https://docs.aws.amazon.com/' ],
509
+ chunking_strategy = bedrock.ChunkingStrategy.HIERARCHICAL_COHERE ,
510
+ custom_transformation = bedrock.CustomTransformation.lambda_(
511
+ lambda_function = function,
512
+ s3_bucket_uri = f ' s3:// { docBucket.bucket_name} /chunk-processor/ '
513
+ )
514
+ )
515
+
516
+ kb.add_s3_data_source(
517
+ bucket = docBucket,
518
+ chunking_strategy = bedrock.ChunkingStrategy.SEMANTIC ,
519
+ parsing_strategy = bedrock.ParsingStategy.foundation_model(
520
+ parsing_model = bedrock.BedrockFoundationModel.ANTHROPIC_CLAUDE_3_5_SONNET_V1_0 .as_i_model(self )
521
+ )
522
+ )
523
+
524
+ secret = secretsmanager.Secret(self , ' Secret' )
525
+ key = kms.Key(self , ' Key' )
526
+
527
+ kb.add_confluence_data_source(
528
+ data_source_name = ' TestDataSource' ,
529
+ auth_secret = secret,
530
+ kms_key = key,
531
+ confluence_url = ' https://example.atlassian.net' ,
532
+ filters = [
533
+ bedrock.ConfluenceCrawlingFilters(
534
+ object_type = bedrock.ConfluenceObjectType.ATTACHMENT ,
535
+ include_patterns = [" .*\\ .pdf" ],
536
+ exclude_patterns = [" .*private.*\\ .pdf" ],
537
+ ),
538
+ bedrock.ConfluenceCrawlingFilters(
539
+ object_type = bedrock.ConfluenceObjectType.PAGE ,
540
+ include_patterns = [" .*public.*\\ .pdf" ],
541
+ exclude_patterns = [" .*confidential.*\\ .pdf" ],
542
+ ),
543
+ ]
544
+ )
545
+
546
+ kb.add_salesforce_data_source(
547
+ auth_secret = secret,
548
+ endpoint = ' https://your-instance.my.salesforce.com' ,
549
+ kms_key = key,
550
+ filters = [
551
+ bedrock.SalesforceCrawlingFilters(
552
+ object_type = bedrock.SalesforceObjectType.ATTACHMENT ,
553
+ include_patterns = [" .*\\ .pdf" ],
554
+ exclude_patterns = [" .*private.*\\ .pdf" ],
555
+ ),
556
+ bedrock.SalesforceCrawlingFilters(
557
+ object_type = bedrock.SalesforceObjectType.CONTRACT ,
558
+ include_patterns = [" .*public.*\\ .pdf" ],
559
+ exclude_patterns = [" .*confidential.*\\ .pdf" ],
560
+ ),
561
+ ]
562
+ )
563
+
564
+ kb.add_share_point_data_source(
565
+ data_source_name = ' SharepointDataSource' ,
566
+ auth_secret = secret,
567
+ kms_key = key,
568
+ domain = ' yourDomain' ,
569
+ site_urls = [' https://yourdomain.sharepoint.com/sites/mysite' ],
570
+ tenant_id = ' 888d0b57-69f1-4fb8-957f-e1f0bedf64de' ,
571
+ filters = [
572
+ bedrock.SharePointCrawlingFilters(
573
+ object_type = bedrock.SharePointObjectType.PAGE ,
574
+ include_patterns = [" .*\\ .pdf" ],
575
+ exclude_patterns = [" .*private.*\\ .pdf" ],
576
+ ),
577
+ bedrock.SharePointCrawlingFilters(
578
+ object_type = bedrock.SharePointObjectType.FILE ,
579
+ include_patterns = [" .*public.*\\ .pdf" ],
580
+ exclude_patterns = [" .*confidential.*\\ .pdf" ],
581
+ ),
582
+ ]
583
+ )
584
+
585
+ ```
586
+
485
587
#### Knowledge Base - Chunking Strategies
486
588
487
589
- ** Default Chunking** : Applies Fixed Chunking with the default chunk size of 300 tokens and 20% overlap.
488
590
591
+ TypeScript
489
592
``` ts
490
593
ChunkingStrategy .DEFAULT ;
491
594
```
492
595
596
+ Python
597
+
598
+ ``` python
599
+ ChunkingStrategy.DEFAULT
600
+ ```
601
+
493
602
- ** Fixed Size Chunking** : This method divides the data into fixed-size chunks, with each chunk
494
603
containing a predetermined number of tokens. This strategy is useful when the data is uniform
495
604
in size and structure.
496
605
Typescript
497
606
607
+ TypeScript
498
608
``` ts
499
609
// Fixed Size Chunking with sane defaults.
500
610
ChunkingStrategy .FIXED_SIZE ;
@@ -503,10 +613,24 @@ kb.addSharePointDataSource({
503
613
ChunkingStrategy .fixedSize ({ maxTokens: 200 , overlapPercentage: 25 });
504
614
```
505
615
616
+ Python
617
+
618
+ ``` python
619
+ # Fixed Size Chunking with sane defaults.
620
+ ChunkingStrategy.FIXED_SIZE
621
+
622
+ # Fixed Size Chunking with custom values.
623
+ ChunkingStrategy.fixed_size(
624
+ max_tokens = 200 ,
625
+ overlap_percentage = 25
626
+ )
627
+ ```
628
+
506
629
- ** Hierarchical Chunking** : This strategy organizes data into layers of chunks, with the first
507
630
layer containing large chunks and the second layer containing smaller chunks derived from the first.
508
631
It is ideal for data with inherent hierarchies or nested structures.
509
632
633
+ TypeScript
510
634
``` ts
511
635
// Hierarchical Chunking with the default for Cohere Models.
512
636
ChunkingStrategy .HIERARCHICAL_COHERE ;
@@ -523,10 +647,29 @@ kb.addSharePointDataSource({
523
647
});
524
648
```
525
649
650
+ Python
651
+
652
+ ``` python
653
+ # Hierarchical Chunking with the default for Cohere Models.
654
+ ChunkingStrategy.HIERARCHICAL_COHERE
655
+
656
+ # Hierarchical Chunking with the default for Titan Models.
657
+ ChunkingStrategy.HIERARCHICAL_TITAN
658
+
659
+ # Hierarchical Chunking with custom values. The maximum chunk size depends on the model.
660
+ # Amazon Titan Text Embeddings: 8192. Cohere Embed models: 512
661
+ ChunkingStrategy.hierarchical(
662
+ overlap_tokens = 60 ,
663
+ max_parent_token_size = 1500 ,
664
+ max_child_token_size = 300
665
+ )
666
+ ```
667
+
526
668
- ** Semantic Chunking** : This method splits data into smaller documents based on groups of similar
527
669
content derived from the text using natural language processing. It helps preserve contextual
528
670
relationships and ensures accurate and contextually appropriate results.
529
671
672
+ TypeScript
530
673
``` ts
531
674
// Semantic Chunking with sane defaults.
532
675
ChunkingStrategy .SEMANTIC ;
@@ -535,13 +678,34 @@ kb.addSharePointDataSource({
535
678
ChunkingStrategy .semantic ({ bufferSize: 0 , breakpointPercentileThreshold: 95 , maxTokens: 300 });
536
679
```
537
680
681
+ Python
682
+
683
+ ``` python
684
+ # Semantic Chunking with sane defaults.
685
+ ChunkingStrategy.SEMANTIC
686
+
687
+ # Semantic Chunking with custom values.
688
+ ChunkingStrategy.semantic(
689
+ buffer_size = 0 ,
690
+ breakpoint_percentile_threshold = 95 ,
691
+ max_tokens = 300
692
+ )
693
+ ```
694
+
538
695
- ** No Chunking** : This strategy treats each file as one chunk. If you choose this option,
539
696
you may want to pre-process your documents by splitting them into separate files.
540
697
698
+ TypeScript
541
699
``` ts
542
700
ChunkingStrategy .NONE ;
543
701
```
544
702
703
+ Python
704
+
705
+ ``` python
706
+ ChunkingStrategy.NONE
707
+ ```
708
+
545
709
#### Knowledge Base - Parsing Strategy
546
710
547
711
A parsing strategy in Amazon Bedrock is a configuration that determines how the service
@@ -557,12 +721,21 @@ two parsing strategies:
557
721
the contents of the document. It is particularly useful for improved processing of PDF files
558
722
with tables and images. To use this strategy, set the ` parsingStrategy ` in a data source as below.
559
723
724
+ TypeScript
560
725
``` ts
561
726
bedrock .ParsingStategy .foundationModel ({
562
727
model: BedrockFoundationModel .ANTHROPIC_CLAUDE_SONNET_V1_0 .asIModel (stack ),
563
728
});
564
729
```
565
730
731
+ Python
732
+
733
+ ``` python
734
+ bedrock.ParsingStategy.foundation_model(
735
+ parsing_model = BedrockFoundationModel.ANTHROPIC_CLAUDE_SONNET_V1_0 .as_i_model(self )
736
+ )
737
+ ```
738
+
566
739
#### Knowledge Base - Custom Transformation
567
740
568
741
Custom Transformation in Amazon Bedrock is a feature that allows you to create and apply
@@ -572,13 +745,22 @@ Custom Transformation uses AWS Lambda functions to process documents, enabling y
572
745
perform custom operations such as data extraction, normalization, or enrichment. To
573
746
create a custom transformation, set the ` customTransformation ` in a data source as below.
574
747
575
- ``` ts
748
+ TypeScript
749
+ ``` ts
576
750
CustomTransformation .lambda ({
577
751
lambdaFunction: lambdaFunction ,
578
752
s3BucketUri: ` s3://${bucket .bucketName }/chunk-processor/ ` ,
579
753
}),
580
754
```
581
755
756
+ Python
757
+ ``` python
758
+ CustomTransformation.lambda_(
759
+ lambda_function = function,
760
+ s3_bucket_uri = f ' s3:// { docBucket.bucket_name} /chunk-processor/ '
761
+ )
762
+ ```
763
+
582
764
## Agents
583
765
584
766
Enable generative AI applications to execute multistep tasks across company systems and data sources.
0 commit comments