Skip to content

Commit febc38d

Browse files
authored
chore(readme): add python code snippet for bedrock knowledge base data source (#743)
1 parent f42e9c7 commit febc38d

File tree

1 file changed

+194
-12
lines changed

1 file changed

+194
-12
lines changed

src/cdk-lib/bedrock/README.md

+194-12
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,6 @@ bedrock.S3DataSource(self, 'DataSource',
9797
knowledge_base=kb,
9898
data_source_name='books',
9999
chunking_strategy= bedrock.ChunkingStrategy.FIXED_SIZE,
100-
max_tokens=500,
101-
overlap_percentage=20
102100
)
103101

104102
```
@@ -226,8 +224,6 @@ new bedrock.S3DataSource(this, "DataSource", {
226224
knowledgeBase: kb,
227225
dataSourceName: "books",
228226
chunkingStrategy: bedrock.ChunkingStrategy.FIXED_SIZE,
229-
maxTokens: 500,
230-
overlapPercentage: 20,
231227
});
232228
```
233229

@@ -287,9 +283,7 @@ bedrock.S3DataSource(self, 'DataSource',
287283
bucket= docBucket,
288284
knowledge_base=kb,
289285
data_source_name='books',
290-
chunking_strategy= bedrock.ChunkingStrategy.FIXED_SIZE,
291-
max_tokens=500,
292-
overlap_percentage=20
286+
chunking_strategy= bedrock.ChunkingStrategy.FIXED_SIZE,
293287
)
294288
```
295289

@@ -323,8 +317,6 @@ new bedrock.S3DataSource(this, "DataSource", {
323317
knowledgeBase: kb,
324318
dataSourceName: "books",
325319
chunkingStrategy: bedrock.ChunkingStrategy.FIXED_SIZE,
326-
maxTokens: 500,
327-
overlapPercentage: 20,
328320
});
329321
```
330322

@@ -361,8 +353,6 @@ bedrock.S3DataSource(self, 'DataSource',
361353
knowledge_base=kb,
362354
data_source_name='books',
363355
chunking_strategy= bedrock.ChunkingStrategy.FIXED_SIZE,
364-
max_tokens=500,
365-
overlap_percentage=20
366356
)
367357
```
368358

@@ -482,19 +472,139 @@ kb.addSharePointDataSource({
482472
});
483473
```
484474

475+
Python
476+
477+
```python
478+
from aws_cdk import (
479+
Stack,
480+
aws_s3 as s3,
481+
aws_lambda as _lambda,
482+
aws_secretsmanager as secretsmanager,
483+
aws_kms as kms
484+
)
485+
from constructs import Construct
486+
from cdklabs.generative_ai_cdk_constructs import (
487+
bedrock
488+
)
489+
490+
class PythonTestStack(Stack):
491+
492+
def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
493+
super().__init__(scope, construct_id, **kwargs)
494+
495+
kb = bedrock.KnowledgeBase(self, 'MyKnowledgeBase',
496+
embeddings_model= bedrock.BedrockFoundationModel.COHERE_EMBED_MULTILINGUAL_V3,
497+
)
498+
499+
docBucket = s3.Bucket(self, 'Bucket')
500+
501+
function = _lambda.Function(self, 'MyFunction',
502+
runtime=_lambda.Runtime.PYTHON_3_12,
503+
handler='index.handler',
504+
code=_lambda.Code.from_inline('print("Hello, World!")'),
505+
)
506+
507+
kb.add_web_crawler_data_source(
508+
source_urls= ['https://docs.aws.amazon.com/'],
509+
chunking_strategy= bedrock.ChunkingStrategy.HIERARCHICAL_COHERE,
510+
custom_transformation= bedrock.CustomTransformation.lambda_(
511+
lambda_function= function,
512+
s3_bucket_uri= f's3://{docBucket.bucket_name}/chunk-processor/'
513+
)
514+
)
515+
516+
kb.add_s3_data_source(
517+
bucket= docBucket,
518+
chunking_strategy= bedrock.ChunkingStrategy.SEMANTIC,
519+
parsing_strategy= bedrock.ParsingStategy.foundation_model(
520+
parsing_model= bedrock.BedrockFoundationModel.ANTHROPIC_CLAUDE_3_5_SONNET_V1_0.as_i_model(self)
521+
)
522+
)
523+
524+
secret = secretsmanager.Secret(self, 'Secret')
525+
key = kms.Key(self, 'Key')
526+
527+
kb.add_confluence_data_source(
528+
data_source_name='TestDataSource',
529+
auth_secret=secret,
530+
kms_key=key,
531+
confluence_url='https://example.atlassian.net',
532+
filters=[
533+
bedrock.ConfluenceCrawlingFilters(
534+
object_type=bedrock.ConfluenceObjectType.ATTACHMENT,
535+
include_patterns= [".*\\.pdf"],
536+
exclude_patterns= [".*private.*\\.pdf"],
537+
),
538+
bedrock.ConfluenceCrawlingFilters(
539+
object_type=bedrock.ConfluenceObjectType.PAGE,
540+
include_patterns= [".*public.*\\.pdf"],
541+
exclude_patterns= [".*confidential.*\\.pdf"],
542+
),
543+
]
544+
)
545+
546+
kb.add_salesforce_data_source(
547+
auth_secret=secret,
548+
endpoint='https://your-instance.my.salesforce.com',
549+
kms_key=key,
550+
filters=[
551+
bedrock.SalesforceCrawlingFilters(
552+
object_type=bedrock.SalesforceObjectType.ATTACHMENT,
553+
include_patterns= [".*\\.pdf"],
554+
exclude_patterns= [".*private.*\\.pdf"],
555+
),
556+
bedrock.SalesforceCrawlingFilters(
557+
object_type=bedrock.SalesforceObjectType.CONTRACT,
558+
include_patterns= [".*public.*\\.pdf"],
559+
exclude_patterns= [".*confidential.*\\.pdf"],
560+
),
561+
]
562+
)
563+
564+
kb.add_share_point_data_source(
565+
data_source_name='SharepointDataSource',
566+
auth_secret=secret,
567+
kms_key=key,
568+
domain='yourDomain',
569+
site_urls= ['https://yourdomain.sharepoint.com/sites/mysite'],
570+
tenant_id='888d0b57-69f1-4fb8-957f-e1f0bedf64de',
571+
filters=[
572+
bedrock.SharePointCrawlingFilters(
573+
object_type=bedrock.SharePointObjectType.PAGE,
574+
include_patterns= [".*\\.pdf"],
575+
exclude_patterns= [".*private.*\\.pdf"],
576+
),
577+
bedrock.SharePointCrawlingFilters(
578+
object_type=bedrock.SharePointObjectType.FILE,
579+
include_patterns= [".*public.*\\.pdf"],
580+
exclude_patterns= [".*confidential.*\\.pdf"],
581+
),
582+
]
583+
)
584+
585+
```
586+
485587
#### Knowledge Base - Chunking Strategies
486588

487589
- **Default Chunking**: Applies Fixed Chunking with the default chunk size of 300 tokens and 20% overlap.
488590

591+
TypeScript
489592
```ts
490593
ChunkingStrategy.DEFAULT;
491594
```
492595

596+
Python
597+
598+
```python
599+
ChunkingStrategy.DEFAULT
600+
```
601+
493602
- **Fixed Size Chunking**: This method divides the data into fixed-size chunks, with each chunk
494603
containing a predetermined number of tokens. This strategy is useful when the data is uniform
495604
in size and structure.
496605
TypeScript
497606

607+
TypeScript
498608
```ts
499609
// Fixed Size Chunking with sane defaults.
500610
ChunkingStrategy.FIXED_SIZE;
@@ -503,10 +613,24 @@ kb.addSharePointDataSource({
503613
ChunkingStrategy.fixedSize({ maxTokens: 200, overlapPercentage: 25 });
504614
```
505615

616+
Python
617+
618+
```python
619+
# Fixed Size Chunking with sane defaults.
620+
ChunkingStrategy.FIXED_SIZE
621+
622+
# Fixed Size Chunking with custom values.
623+
ChunkingStrategy.fixed_size(
624+
max_tokens= 200,
625+
overlap_percentage= 25
626+
)
627+
```
628+
506629
- **Hierarchical Chunking**: This strategy organizes data into layers of chunks, with the first
507630
layer containing large chunks and the second layer containing smaller chunks derived from the first.
508631
It is ideal for data with inherent hierarchies or nested structures.
509632

633+
TypeScript
510634
```ts
511635
// Hierarchical Chunking with the default for Cohere Models.
512636
ChunkingStrategy.HIERARCHICAL_COHERE;
@@ -523,10 +647,29 @@ kb.addSharePointDataSource({
523647
});
524648
```
525649

650+
Python
651+
652+
```python
653+
# Hierarchical Chunking with the default for Cohere Models.
654+
ChunkingStrategy.HIERARCHICAL_COHERE
655+
656+
# Hierarchical Chunking with the default for Titan Models.
657+
ChunkingStrategy.HIERARCHICAL_TITAN
658+
659+
# Hierarchical Chunking with custom values. The maximum chunk size depends on the model.
660+
# Amazon Titan Text Embeddings: 8192. Cohere Embed models: 512
661+
chunking_strategy= ChunkingStrategy.hierarchical(
662+
overlap_tokens=60,
663+
max_parent_token_size=1500,
664+
max_child_token_size=300
665+
)
666+
```
667+
526668
- **Semantic Chunking**: This method splits data into smaller documents based on groups of similar
527669
content derived from the text using natural language processing. It helps preserve contextual
528670
relationships and ensures accurate and contextually appropriate results.
529671

672+
TypeScript
530673
```ts
531674
// Semantic Chunking with sane defaults.
532675
ChunkingStrategy.SEMANTIC;
@@ -535,13 +678,34 @@ kb.addSharePointDataSource({
535678
ChunkingStrategy.semantic({ bufferSize: 0, breakpointPercentileThreshold: 95, maxTokens: 300 });
536679
```
537680

681+
Python
682+
683+
```python
684+
# Semantic Chunking with sane defaults.
685+
ChunkingStrategy.SEMANTIC
686+
687+
# Semantic Chunking with custom values.
688+
ChunkingStrategy.semantic(
689+
buffer_size=0,
690+
breakpoint_percentile_threshold=95,
691+
max_tokens=300
692+
)
693+
```
694+
538695
- **No Chunking**: This strategy treats each file as one chunk. If you choose this option,
539696
you may want to pre-process your documents by splitting them into separate files.
540697

698+
TypeScript
541699
```ts
542700
ChunkingStrategy.NONE;
543701
```
544702

703+
Python
704+
705+
```python
706+
ChunkingStrategy.NONE
707+
```
708+
545709
#### Knowledge Base - Parsing Strategy
546710

547711
A parsing strategy in Amazon Bedrock is a configuration that determines how the service
@@ -557,12 +721,21 @@ two parsing strategies:
557721
the contents of the document. It is particularly useful for improved processing of PDF files
558722
with tables and images. To use this strategy, set the `parsingStrategy` in a data source as below.
559723

724+
TypeScript
560725
```ts
561726
bedrock.ParsingStategy.foundationModel({
562727
model: BedrockFoundationModel.ANTHROPIC_CLAUDE_SONNET_V1_0.asIModel(stack),
563728
});
564729
```
565730

731+
Python
732+
733+
```python
734+
bedrock.ParsingStategy.foundation_model(
735+
parsing_model=BedrockFoundationModel.ANTHROPIC_CLAUDE_SONNET_V1_0.as_i_model(self)
736+
)
737+
```
738+
566739
#### Knowledge Base - Custom Transformation
567740

568741
Custom Transformation in Amazon Bedrock is a feature that allows you to create and apply
@@ -572,13 +745,22 @@ Custom Transformation uses AWS Lambda functions to process documents, enabling y
572745
perform custom operations such as data extraction, normalization, or enrichment. To
573746
create a custom transformation, set the `customTransformation` in a data source as below.
574747

575-
```ts
748+
TypeScript
749+
```ts
576750
CustomTransformation.lambda({
577751
lambdaFunction: lambdaFunction,
578752
s3BucketUri: `s3://${bucket.bucketName}/chunk-processor/`,
579753
}),
580754
```
581755

756+
Python
757+
```python
758+
CustomTransformation.lambda_(
759+
lambda_function= function,
760+
s3_bucket_uri= f's3://{docBucket.bucket_name}/chunk-processor/'
761+
)
762+
```
763+
582764
## Agents
583765

584766
Enable generative AI applications to execute multistep tasks across company systems and data sources.

0 commit comments

Comments
 (0)