@@ -18,7 +18,12 @@
 import numpy as np
 import tempfile
 from helpers.credentials_helper import get_credentials
-from helpers.opensearch_helper import check_if_index_exists, process_shard
+from helpers.csv_loader import csv_loader
+from helpers.image_loader import image_loader
+from helpers.msdoc_loader import msdoc_loader
+from helpers.html_loader import html_loader
+
+from helpers.opensearch_helper import check_if_index_exists, process_shard, create_index_for_image
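+# csv_loader, image_loader, msdoc_loader and html_loader are project-local loaders under helpers/;
+# create_index_for_image is assumed to write a prepared image document straight into OpenSearch
+# (no embeddings), matching how it is used for image jobs in the handler below.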
 from helpers.update_ingestion_status import updateIngestionJobStatus
 from langchain_community.embeddings import BedrockEmbeddings
 from helpers.s3inmemoryloader import S3TxtFileLoaderInMemory
@@ -43,6 +48,7 @@

 opensearch_secret_id = os.environ['OPENSEARCH_SECRET_ID']
 bucket_name = os.environ['OUTPUT_BUCKET']
+# TODO: add input_bucket for csv|images
 opensearch_index = os.environ['OPENSEARCH_INDEX']
 opensearch_domain = os.environ['OPENSEARCH_DOMAIN_ENDPOINT']
 opensearch_api_name = os.environ['OPENSEARCH_API_NAME']
@@ -56,9 +62,10 @@
 PROCESS_COUNT = 5
 INDEX_FILE = "index_file"

-def process_documents_in_es(index_exists, shards, http_auth):
+def process_documents_in_es(index_exists, shards, http_auth, model_id):
     bedrock_client = boto3.client('bedrock-runtime')
-    embeddings = BedrockEmbeddings(client=bedrock_client)
+    embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
+    print(f'Bedrock embeddings model id :: {embeddings.model_id}')
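+    # The embeddings client now uses the model_id chosen for this job (passed in from the handler)
+    # rather than the BedrockEmbeddings default, and logs it for traceability.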


     if index_exists is False:
         # create an index if the create index hint file exists
@@ -104,11 +111,13 @@ def process_documents_in_es(index_exists, shards, http_auth):
                                os_domain_ep=opensearch_domain,
                                os_http_auth=http_auth)

-def process_documents_in_aoss(index_exists, shards, http_auth):
+def process_documents_in_aoss(index_exists, shards, http_auth, model_id):
     # Reference: https://python.langchain.com/docs/integrations/vectorstores/opensearch#using-aoss-amazon-opensearch-service-serverless
     bedrock_client = boto3.client('bedrock-runtime')
-    embeddings = BedrockEmbeddings(client=bedrock_client)
-
+    embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
+
+    print(f'Bedrock embeddings model id :: {embeddings.model_id}')
+
     shard_start_index = 0
     if index_exists is False:
         OpenSearchVectorSearch.from_documents(
@@ -125,18 +134,19 @@ def process_documents_in_aoss(index_exists, shards, http_auth):
         )
         # we now need to start the loop below for the second shard
         shard_start_index = 1
-
+    print('start processing shards')
     for shard in shards[shard_start_index:]:
         results = process_shard(shard=shard,
                                 os_index_name=opensearch_index,
                                 os_domain_ep=opensearch_domain,
-                                os_http_auth=http_auth)
+                                os_http_auth=http_auth,
+                                model_id=model_id)
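+        # process_shard is a project helper; it presumably needs the model_id to build its own
+        # BedrockEmbeddings client per shard, hence the extra argument.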

 @logger.inject_lambda_context(log_event=True)
 @tracer.capture_lambda_handler
 @metrics.log_metrics(capture_cold_start_metric=True)
 def handler(event, context: LambdaContext) -> dict:
-
+    print(f'event {event}')
     # if the secret id is not provided
     # uses username password
     if opensearch_secret_id != 'NONE':  # nosec
@@ -151,33 +161,61 @@ def handler(event, context: LambdaContext) -> dict:
             session_token=credentials.token
         )
     job_id = event[0]['s3_transformer_result']['Payload']['jobid']
+    modelid = event[0]['s3_transformer_result']['Payload']['modelid']
+
+    print(f'model id :: {modelid}')
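+    # The model id is read from the payload of the first transformed file (event[0]) and applied to the
+    # whole job: it is passed to the ES/AOSS ingestion paths below and stamped into each document's metadata.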

     logger.set_correlation_id(job_id)
     metrics.add_metadata(key='correlationId', value=job_id)
     tracer.put_annotation(key="correlationId", value=job_id)

     files = []
     for transformed_file in event:
-        files.append({'name': transformed_file['name'], 'status': transformed_file['s3_transformer_result']['Payload']['status']})
+        files.append({'name': transformed_file['name'],
+                      'status': transformed_file['s3_transformer_result']['Payload']['status'],
+                      'imageurl': ''})
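+        # Each tracked file now carries an imageurl field; it stays empty for text documents and is
+        # filled with the image's presigned URL once an image has been ingested (see the status loop below).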
     updateIngestionJobStatus({'jobid': job_id, 'files': files})

+
+
     print(f'Loading txt raw files from {bucket_name}')

     docs = []
+
+    # Images are stored in S3 with a presigned URL; no embeddings are required for them.
+    # Initialize the image-related variables up front so the later references are safe when a job has no images.
+    url = ''
+    os_document = None

     for transformed_file in event:
+        print(f"status :: {transformed_file['s3_transformer_result']['Payload']['status']}")
         if transformed_file['s3_transformer_result']['Payload']['status'] == 'File transformed':
             filename = transformed_file['s3_transformer_result']['Payload']['name']
-            loader = S3TxtFileLoaderInMemory(bucket_name, filename)
-            sub_docs = loader.load()
-            for doc in sub_docs:
-                doc.metadata['source'] = filename
-            docs.extend(sub_docs)
+            name, extension = os.path.splitext(filename)
+            print(f"the name {name} and extension {extension}")
+            # TODO: check file format; if pdf then read raw text from output bucket and update docs[]
+            # if csv|image then read file from input bucket using langchain document loader and update docs[]
+            if extension == '.pdf':
+                loader = S3TxtFileLoaderInMemory(bucket_name, filename)
+                sub_docs = loader.load()
+                for doc in sub_docs:
+                    doc.metadata['source'] = filename
+                docs.extend(sub_docs)
+            if extension == '.jpg' or extension == '.jpeg' or extension == '.png':
+                # Try adding text to document
+                # the image detail text file is created by aws rekognition
+                img_load = image_loader(bucket_name, f"{name}-resized.png", f"{name}.txt")
+                sub_docs = img_load.load()
+                for doc in sub_docs:
+                    doc.metadata['source'] = filename
+                docs.extend(sub_docs)
+                url = img_load.get_presigned_url()
+                print(f"url set :: {url}")
+                print("prepare os object")
+                os_document = img_load.prepare_document_for_direct_load()
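+                # Image files take a different path from text: image_loader (a project helper) is assumed to
+                # read the resized image and its Rekognition text file from S3, expose a presigned URL for the
+                # UI, and prepare an OpenSearch document that is indexed directly, bypassing chunking/embedding.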
+

     if not docs:
-        return {
-            'status': 'nothing to ingest'
-        }
+        return {
+            'status': 'nothing to ingest'
+        }

     text_splitter = RecursiveCharacterTextSplitter(
         # Set a really small chunk size, just to show.
@@ -192,18 +230,20 @@ def handler(event, context: LambdaContext) -> dict:
     # we can augment data here probably (PII present ? ...)
     for doc in docs:
         doc.metadata['timestamp'] = time.time()
-        doc.metadata['embeddings_model'] = 'amazon.titan-embed-text-v1'
+        # doc.metadata['embeddings_model'] = 'amazon.titan-embed-text-v1'
+        doc.metadata['embeddings_model'] = modelid
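+        # Metadata now records whichever Bedrock embeddings model was actually requested for the job,
+        # instead of the previously hard-coded amazon.titan-embed-text-v1.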
     chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])

     db_shards = (len(chunks) // MAX_OS_DOCS_PER_PUT) + 1
-    print(f'Loading chunks into vector store ... using {db_shards} shards')
     shards = np.array_split(chunks, db_shards)

     # first check if index exists, if it does then call the add_documents function
     # otherwise call the from_documents function which would first create the index
     # and then do a bulk add. Both add_documents and from_documents do a bulk add
     # but it is important to call from_documents first so that the index is created
     # correctly for K-NN
+
+    print('check if the index exists')
     try:
         index_exists = check_if_index_exists(opensearch_index,
                                              aws_region,
@@ -218,19 +258,27 @@ def handler(event, context: LambdaContext) -> dict:
             'status': 'failed'
         }

-    if opensearch_api_name == "es":
-        process_documents_in_es(index_exists, shards, http_auth)
-    elif opensearch_api_name == "aoss":
-        process_documents_in_aoss(index_exists, shards, http_auth)
+    print(f'job_id :: {job_id}')
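+    # job_id "101" appears to be a hard-coded marker for image-only ingestion jobs: those write the prepared
+    # image document to OpenSearch directly via create_index_for_image, while every other job goes through
+    # the usual embed-and-bulk-load path for ES or AOSS.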
+    if job_id == "101":
+        print('running for job_id 101, use os directly')
+        create_index_for_image(os_document)
+    else:
+        print(f'Loading chunks into vector store ... using {db_shards} shards')
+        if opensearch_api_name == "es":
+            process_documents_in_es(index_exists, shards, http_auth, modelid)
+        elif opensearch_api_name == "aoss":
+            process_documents_in_aoss(index_exists, shards, http_auth, modelid)

+

     for file in files:
         if file['status'] == 'File transformed':
-            file['status'] = 'Ingested'
+            file['status'] = 'Ingested'
+            file['imageurl'] = url
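+            # For image ingestions the presigned URL captured earlier is surfaced through the job status so the
+            # caller can render the stored image; for text-only jobs it remains the empty string set above.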
         else:
             file['status'] = 'Error_' + file['status']
     updateIngestionJobStatus({'jobid': job_id, 'files': files})

     return {
         'status': 'succeed'
-    }
+    }