@@ -109,16 +109,8 @@ def process_documents_in_es(index_exists, shards, http_auth,model_id):
         os_http_auth=http_auth)
 
 def process_documents_in_aoss(index_exists, shards, http_auth,model_id):
-    # Reference: https://python.langchain.com/docs/integrations/vectorstores/opensearch#using-aoss-amazon-opensearch-service-serverless
     bedrock_client = boto3.client('bedrock-runtime')
-    # if(model_id=='amazon.titan-embed-image-v1'):
-    #     print(f'image embeddings shards[0] {shards}')
-    #     embeddings = image_loader.BedrockEmbeddings_image(docs=shards[0], model_id=model_id,)
-    # else:
-    #     embeddings = BedrockEmbeddings(client=bedrock_client,model_id=model_id)
     embeddings = BedrockEmbeddings(client=bedrock_client,model_id=model_id)
-
-    print(f'check index with :: {shards[0]}')
 
     shard_start_index = 0
     if index_exists is False:
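Note: this hunk drops the commented-out image-specific branch and always builds a plain BedrockEmbeddings instance. A minimal sketch of the LangChain API in use here, assuming the langchain_community import path (older LangChain versions expose it from langchain.embeddings instead):

    import boto3
    from langchain_community.embeddings import BedrockEmbeddings

    bedrock_client = boto3.client('bedrock-runtime')
    embeddings = BedrockEmbeddings(client=bedrock_client,
                                   model_id='amazon.titan-embed-text-v1')
    vector = embeddings.embed_query('hello world')  # list of floats; length depends on the model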
@@ -166,7 +158,7 @@ def handler(event, context: LambdaContext) -> dict:
     job_id = event[0]['s3_transformer_result']['Payload']['jobid']
     modelid = event[0]['s3_transformer_result']['Payload']['modelid']
 
-    print(f'model id :: {modelid}')
+    logger.info(f'model id :: {modelid}')
 
     logger.set_correlation_id(job_id)
     metrics.add_metadata(key='correlationId', value=job_id)
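Note: print is swapped for the structured logger that already carries the job correlation id. A sketch of the Lambda Powertools setup this handler appears to rely on (the module-level names and the metrics namespace are assumed; they are not shown in this diff):

    from aws_lambda_powertools import Logger, Metrics
    from aws_lambda_powertools.utilities.typing import LambdaContext

    logger = Logger()
    metrics = Metrics(namespace='docs-pipeline')  # hypothetical namespace

    def handler(event, context: LambdaContext) -> dict:
        job_id = event[0]['s3_transformer_result']['Payload']['jobid']
        logger.set_correlation_id(job_id)  # every later logger.info line carries the job id
        logger.info('processing started')
        return {'status': 'ok'}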
@@ -181,7 +173,7 @@ def handler(event, context: LambdaContext) -> dict:
 
 
 
-    print(f'Loading txt raw files from {bucket_name}')
+    logger.info(f'Loading txt raw files from {bucket_name}')
 
     docs = []
 
@@ -228,7 +220,6 @@ def process_text_embeddings(docs,modelid,http_auth,files,job_id):
     # we can augment data here probably (PII present ? ...)
     for doc in docs:
         doc.metadata['timestamp'] = time.time()
-        # doc.metadata['embeddings_model'] = 'amazon.titan-embed-text-v1'
         doc.metadata['embeddings_model'] = modelid
     chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
 
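Note: each source document is stamped with a timestamp and the embeddings model before chunking, and create_documents copies metadatas[i] onto every chunk cut from document i, so that provenance survives splitting. A small self-contained illustration (the splitter's construction is not shown in this diff; RecursiveCharacterTextSplitter is assumed):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0)
    chunks = text_splitter.create_documents(
        ['a long body of text ' * 20],
        metadatas=[{'embeddings_model': 'amazon.titan-embed-text-v1'}])
    # every chunk inherits the metadata of the document it was cut from
    assert all(c.metadata['embeddings_model'] == 'amazon.titan-embed-text-v1' for c in chunks)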
@@ -276,7 +267,7 @@ def process_image_embeddings(docs,modelid,http_auth,files,job_id,url):
     for doc in docs:
         doc.metadata['timestamp'] = time.time()
         doc.metadata['embeddings_model'] = modelid
-
+    # not using text splitter, using whole image embedding as one array
     shards = np.array_split(docs,1)
 
     try:
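Note: the new comment documents why the image path has no splitter step: each image is embedded whole, so np.array_split(docs, 1) simply wraps all documents in a single shard to keep the downstream sharded-processing interface intact. A quick illustration of the call's behavior:

    import numpy as np

    docs = ['img1', 'img2', 'img3']
    shards = np.array_split(docs, 1)  # one shard containing every document
    assert len(shards) == 1 and list(shards[0]) == docs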