Skip to content

Commit b1205b8

Browse files
statefb, Takehiro Suzuki, krokoko
authored
feat(opensearch serverless): analyzer (#537)
feat(oss): analyzer --------- Co-authored-by: Takehiro Suzuki <[email protected]> Co-authored-by: Alain Krok <[email protected]>
1 parent e731700 commit b1205b8

File tree

14 files changed

+712
-245
lines changed

14 files changed

+712
-245
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
[@cdklabs/generative-ai-cdk-constructs](../README.md) / [opensearchserverless](../modules/opensearchserverless.md) / CharacterFilterType
2+
3+
# Enumeration: CharacterFilterType
4+
5+
[opensearchserverless](../modules/opensearchserverless.md).CharacterFilterType
6+
7+
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
8+
9+
Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
10+
with the License. A copy of the License is located at
11+
12+
http://www.apache.org/licenses/LICENSE-2.0
13+
14+
or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
15+
OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
16+
and limitations under the License.
17+
18+
## Table of contents
19+
20+
### Enumeration Members
21+
22+
- [ICU\_NORMALIZER](opensearchserverless.CharacterFilterType.md#icu_normalizer)
23+
24+
## Enumeration Members
25+
26+
### ICU\_NORMALIZER
27+
28+
**ICU\_NORMALIZER** = ``"icu_normalizer"``
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
[@cdklabs/generative-ai-cdk-constructs](../README.md) / [opensearchserverless](../modules/opensearchserverless.md) / TokenFilterType
2+
3+
# Enumeration: TokenFilterType
4+
5+
[opensearchserverless](../modules/opensearchserverless.md).TokenFilterType
6+
7+
## Table of contents
8+
9+
### Enumeration Members
10+
11+
- [CJK\_WIDTH](opensearchserverless.TokenFilterType.md#cjk_width)
12+
- [ICU\_FOLDING](opensearchserverless.TokenFilterType.md#icu_folding)
13+
- [JA\_STOP](opensearchserverless.TokenFilterType.md#ja_stop)
14+
- [KUROMOJI\_BASEFORM](opensearchserverless.TokenFilterType.md#kuromoji_baseform)
15+
- [KUROMOJI\_PART\_OF\_SPEECH](opensearchserverless.TokenFilterType.md#kuromoji_part_of_speech)
16+
- [KUROMOJI\_STEMMER](opensearchserverless.TokenFilterType.md#kuromoji_stemmer)
17+
- [LOWERCASE](opensearchserverless.TokenFilterType.md#lowercase)
18+
19+
## Enumeration Members
20+
21+
### CJK\_WIDTH
22+
23+
**CJK\_WIDTH** = ``"cjk_width"``
24+
25+
___
26+
27+
### ICU\_FOLDING
28+
29+
**ICU\_FOLDING** = ``"icu_folding"``
30+
31+
___
32+
33+
### JA\_STOP
34+
35+
**JA\_STOP** = ``"ja_stop"``
36+
37+
___
38+
39+
### KUROMOJI\_BASEFORM
40+
41+
**KUROMOJI\_BASEFORM** = ``"kuromoji_baseform"``
42+
43+
___
44+
45+
### KUROMOJI\_PART\_OF\_SPEECH
46+
47+
**KUROMOJI\_PART\_OF\_SPEECH** = ``"kuromoji_part_of_speech"``
48+
49+
___
50+
51+
### KUROMOJI\_STEMMER
52+
53+
**KUROMOJI\_STEMMER** = ``"kuromoji_stemmer"``
54+
55+
___
56+
57+
### LOWERCASE
58+
59+
**LOWERCASE** = ``"lowercase"``
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
[@cdklabs/generative-ai-cdk-constructs](../README.md) / [opensearchserverless](../modules/opensearchserverless.md) / TokenizerType
2+
3+
# Enumeration: TokenizerType
4+
5+
[opensearchserverless](../modules/opensearchserverless.md).TokenizerType
6+
7+
## Table of contents
8+
9+
### Enumeration Members
10+
11+
- [ICU\_TOKENIZER](opensearchserverless.TokenizerType.md#icu_tokenizer)
12+
- [KUROMOJI\_TOKENIZER](opensearchserverless.TokenizerType.md#kuromoji_tokenizer)
13+
14+
## Enumeration Members
15+
16+
### ICU\_TOKENIZER
17+
18+
**ICU\_TOKENIZER** = ``"icu_tokenizer"``
19+
20+
___
21+
22+
### KUROMOJI\_TOKENIZER
23+
24+
**KUROMOJI\_TOKENIZER** = ``"kuromoji_tokenizer"``
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
[@cdklabs/generative-ai-cdk-constructs](../README.md) / [opensearch\_vectorindex](../modules/opensearch_vectorindex.md) / Analyzer
2+
3+
# Interface: Analyzer
4+
5+
[opensearch\_vectorindex](../modules/opensearch_vectorindex.md).Analyzer
6+
7+
Properties for the Analyzer.
8+
9+
## Table of contents
10+
11+
### Properties
12+
13+
- [characterFilters](opensearch_vectorindex.Analyzer.md#characterfilters)
14+
- [tokenFilters](opensearch_vectorindex.Analyzer.md#tokenfilters)
15+
- [tokenizer](opensearch_vectorindex.Analyzer.md#tokenizer)
16+
17+
## Properties
18+
19+
### characterFilters
20+
21+
`Readonly` **characterFilters**: [`ICU_NORMALIZER`](../enums/opensearchserverless.CharacterFilterType.md#icu_normalizer)[]
22+
23+
The character filters to use.
24+
25+
___
26+
27+
### tokenFilters
28+
29+
`Readonly` **tokenFilters**: [`TokenFilterType`](../enums/opensearchserverless.TokenFilterType.md)[]
30+
31+
The token filters to use.
32+
33+
___
34+
35+
### tokenizer
36+
37+
`Readonly` **tokenizer**: [`TokenizerType`](../enums/opensearchserverless.TokenizerType.md)
38+
39+
The tokenizer to use.

apidocs/interfaces/opensearch_vectorindex.VectorIndexProps.md

+15
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Properties for the VectorIndex.
1010

1111
### Properties
1212

13+
- [analyzer](opensearch_vectorindex.VectorIndexProps.md#analyzer)
1314
- [collection](opensearch_vectorindex.VectorIndexProps.md#collection)
1415
- [indexName](opensearch_vectorindex.VectorIndexProps.md#indexname)
1516
- [mappings](opensearch_vectorindex.VectorIndexProps.md#mappings)
@@ -18,6 +19,20 @@ Properties for the VectorIndex.
1819

1920
## Properties
2021

22+
### analyzer
23+
24+
`Optional` `Readonly` **analyzer**: [`Analyzer`](opensearch_vectorindex.Analyzer.md)
25+
26+
The analyzer to use.
27+
28+
**`Default`**
29+
30+
```ts
31+
- No analyzer.
32+
```
33+
34+
___
35+
2136
### collection
2237

2338
`Readonly` **collection**: [`VectorCollection`](../classes/opensearchserverless.VectorCollection.md)

apidocs/modules/opensearch_vectorindex.md

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
### Interfaces
1212

13+
- [Analyzer](../interfaces/opensearch_vectorindex.Analyzer.md)
1314
- [MetadataManagementFieldProps](../interfaces/opensearch_vectorindex.MetadataManagementFieldProps.md)
1415
- [VectorIndexProps](../interfaces/opensearch_vectorindex.VectorIndexProps.md)
1516

apidocs/modules/opensearchserverless.md

+3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66

77
### Enumerations
88

9+
- [CharacterFilterType](../enums/opensearchserverless.CharacterFilterType.md)
10+
- [TokenFilterType](../enums/opensearchserverless.TokenFilterType.md)
11+
- [TokenizerType](../enums/opensearchserverless.TokenizerType.md)
912
- [VectorCollectionStandbyReplicas](../enums/opensearchserverless.VectorCollectionStandbyReplicas.md)
1013

1114
### Classes

lambda/opensearch-serverless-custom-resources/custom_resources/opensearch_index.py

+57-19
Original file line numberDiff line numberDiff line change
@@ -11,27 +11,26 @@
1111
# and limitations under the License.
1212
#
1313

14+
import logging
15+
import os
16+
import time
17+
from typing import Sequence, TypedDict
18+
19+
import boto3
20+
from custom_resources.cr_types import CustomResourceRequest, CustomResourceResponse
1421
from opensearchpy import (
22+
AuthorizationException,
23+
AWSV4SignerAuth,
1524
OpenSearch,
1625
RequestsHttpConnection,
17-
AWSV4SignerAuth,
18-
AuthorizationException,
1926
)
20-
import boto3
21-
import logging
22-
import os
23-
import time
2427
from tenacity import (
2528
retry,
2629
retry_if_exception_type,
2730
stop_after_attempt,
2831
wait_exponential_jitter,
2932
)
3033

31-
from typing import TypedDict, Sequence
32-
33-
from custom_resources.cr_types import CustomResourceRequest, CustomResourceResponse
34-
3534
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
3635

3736
logger = logging.getLogger(__name__)
@@ -44,12 +43,19 @@ class MetadataManagementField(TypedDict):
4443
Filterable: bool
4544

4645

class AnalyzerProperties(TypedDict):
    """Analyzer section of the custom-resource properties.

    Field names match the CloudFormation property payload (PascalCase).
    """

    # Character filter names, e.g. "icu_normalizer".
    CharacterFilters: Sequence[str]
    # Tokenizer name, e.g. "kuromoji_tokenizer".
    Tokenizer: str
    # Token filter names, e.g. "lowercase".
    TokenFilters: Sequence[str]
class VectorIndexProperties(TypedDict):
    """Custom-resource properties describing the vector index to manage."""

    Endpoint: str
    IndexName: str
    VectorField: str
    # CloudFormation can deliver numbers as strings; handlers cast with int().
    Dimensions: int | str
    MetadataManagement: Sequence[MetadataManagementField]
    # NOTE(review): handlers read this with .get("Analyzer", None), so the key
    # may be absent at runtime even though it is declared required here —
    # consider typing.NotRequired; confirm against the CDK-side producer.
    Analyzer: AnalyzerProperties | None
5359

5460

5561
def validate_event(event: CustomResourceRequest[VectorIndexProperties]) -> bool:
@@ -70,6 +76,14 @@ def validate_event(event: CustomResourceRequest[VectorIndexProperties]) -> bool:
7076
raise ValueError("MetadataManagement is required")
7177
if event["RequestType"] == "Update" and event["PhysicalResourceId"] is None:
7278
raise ValueError("PhysicalResourceId is required")
79+
if event["ResourceProperties"].get("Analyzer") is not None:
80+
analyzer = event["ResourceProperties"]["Analyzer"]
81+
if analyzer["CharacterFilters"] is None:
82+
raise ValueError("CharacterFilters is required")
83+
if analyzer["Tokenizer"] is None:
84+
raise ValueError("Tokenizer is required")
85+
if analyzer["TokenFilters"] is None:
86+
raise ValueError("TokenFilters is required")
7387
elif event["RequestType"] == "Delete":
7488
if event["PhysicalResourceId"] is None:
7589
raise ValueError("PhysicalResourceId is required")
@@ -139,18 +153,39 @@ def create_mapping(
139153
return mapping
140154

141155

def create_setting(analyzer: AnalyzerProperties | None) -> dict:
    """Build the index "settings" body for ``indices.create``.

    The k-NN options are always present; an ``analysis`` section defining a
    single custom analyzer named ``custom_analyzer`` is added only when an
    analyzer configuration was supplied.
    """
    knn_index_options = {
        "number_of_shards": "2",
        "knn.algo_param": {"ef_search": "512"},
        "knn": "true",
    }
    settings: dict = {"index": knn_index_options}
    if analyzer:
        # Wire the caller-supplied pieces into a single custom analyzer.
        settings["analysis"] = {
            "analyzer": {
                "custom_analyzer": {
                    "type": "custom",
                    "tokenizer": analyzer["Tokenizer"],
                    "char_filter": analyzer["CharacterFilters"],
                    "filter": analyzer["TokenFilters"],
                }
            }
        }
    return settings
179+
# NOTE(review): diff fragment — the closing ")" of the indices.create call is
# outside this hunk, so the definition is truncated here. Creates the index
# from the prebuilt mapping/setting bodies (see create_mapping/create_setting).
# The dict[str, str] annotations understate the nested dict bodies actually
# passed — confirm.
def create_index(
180+
client: OpenSearch, index_name: str, mapping: dict[str, str], setting: dict[str, str]
181+
) -> None:
143182
# Log the request pieces at debug level before issuing the create call.
logger.debug(f"creating index {index_name}")
183+
logger.debug(f"setting: {setting}")
184+
logger.debug(f"mapping: {mapping}")
144185
client.indices.create(
145186
index_name,
146187
body={
147-
"settings": {
148-
"index": {
149-
"number_of_shards": "2",
150-
"knn.algo_param": {"ef_search": "512"},
151-
"knn": "true",
152-
}
153-
},
188+
# Settings now come from create_setting() (replaces the hard-coded
# knn-only block removed above).
"settings": setting,
154189
"mappings": mapping,
155190
},
156191
# Per the OpenSearch create-index API, wait_for_active_shards="all" makes
# the call wait until every shard copy is active.
params={"wait_for_active_shards": "all"},
@@ -171,13 +206,15 @@ def handle_create(
171206
vector_field: str,
172207
dimensions: int,
173208
metadata_management: Sequence[MetadataManagementField],
209+
analyzer: AnalyzerProperties | None,
174210
):
175211
if client.indices.exists(index_name):
176212
raise ValueError(f"Index {index_name} already exists")
177213

178214
try:
179215
mapping = create_mapping(vector_field, dimensions, metadata_management)
180-
create_index(client, index_name, mapping)
216+
setting = create_setting(analyzer)
217+
create_index(client, index_name, mapping, setting)
181218
except Exception as e:
182219
logger.error(f"Error creating index {index_name}")
183220
logger.exception(e)
@@ -211,6 +248,7 @@ def on_create(
211248
event["ResourceProperties"]["VectorField"],
212249
int(event["ResourceProperties"]["Dimensions"]),
213250
event["ResourceProperties"]["MetadataManagement"],
251+
event["ResourceProperties"].get("Analyzer", None),
214252
)
215253
return {"PhysicalResourceId": physical_id}
216254

0 commit comments

Comments
 (0)