
Commit 10f556f

Dinesh Sajwan committed
feat(merge): merge qa with data ingestion changes
2 parents f04ea6b + 6c17076

File tree

10 files changed: +545 -55 lines changed
@@ -0,0 +1,120 @@
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+# with the License. A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+# and limitations under the License.
+#
+import base64
+import json
+import os
+from typing import List
+from botocore.exceptions import ClientError
+
+from aws_lambda_powertools import Logger, Tracer
+#from langchain_community.document_loaders.image import UnstructuredImageLoader
+from langchain.docstore.document import Document
+
+import boto3
+
+s3 = boto3.client('s3')
+
+logger = Logger(service="INGESTION_FILE_TRANSFORMER")
+tracer = Tracer(service="INGESTION_FILE_TRANSFORMER")
+
+@tracer.capture_method
+class image_loader():
+    """Loading logic for image documents stored in S3."""
+
+    def __init__(self, bucket: str, image_file: str, image_detail_file: str):
+        """Initialize with bucket and key names."""
+        self.bucket = bucket
+        self.image_file = image_file
+        self.image_detail_file = image_detail_file
+
+    @tracer.capture_method
+    def load(self):
+        """Load the image as a base64-encoded LangChain Document."""
+        try:
+            local_file_path = self.download_file(self.image_file)
+            print(f"file downloaded :: {local_file_path}")
+
+            with open(local_file_path, "rb") as image_file:
+                input_image = base64.b64encode(image_file.read()).decode("utf8")
+
+            s3_resource = boto3.resource('s3')
+            obj = s3_resource.Object(self.bucket, self.image_detail_file)
+            raw_text = obj.get()['Body'].read().decode('utf-8')
+
+            metadata = {"source": self.image_file}
+
+            docs = json.dumps({
+                "inputImage": input_image,
+                #"inputText": raw_text,
+            })
+            #print(f'docs for titan embeddings {docs}')
+            return [Document(page_content=docs, metadata=metadata)]
+
+        except Exception as exception:
+            logger.exception(f"Reason: {exception}")
+            return ""
+
+    @tracer.capture_method
+    def get_presigned_url(self) -> str:
+        try:
+            url = s3.generate_presigned_url(
+                ClientMethod='get_object',
+                Params={'Bucket': self.bucket, 'Key': self.image_file},
+                ExpiresIn=900
+            )
+            print(f"presigned url generated for {self.image_file} from {self.bucket}")
+            return url
+        except Exception as exception:
+            logger.exception(f"Reason: {exception}")
+            return ""
+
+    @tracer.capture_method
+    def download_file(self, key) -> str:
+        try:
+            file_path = "/tmp/" + os.path.basename(key)
+            s3.download_file(self.bucket, key, file_path)
+            print(f"file downloaded {file_path}")
+            return file_path
+        except ClientError as client_err:
+            print(f"Couldn't download file {client_err.response['Error']['Message']}")
+        except Exception as exp:
+            print(f"Couldn't download file : {exp}")
+
+    @tracer.capture_method
+    def prepare_document_for_direct_load(self) -> dict:
+        local_file_path = self.download_file(self.image_file)
+        print("prepare os_document")
+
+        with open(local_file_path, "rb") as image_file:
+            input_image = base64.b64encode(image_file.read()).decode("utf8")
+
+        s3_resource = boto3.resource('s3')
+        obj = s3_resource.Object(self.bucket, self.image_detail_file)
+        raw_text = obj.get()['Body'].read().decode('utf-8')
+
+        metadata = {"source": self.image_file}
+
+        docs = json.dumps({
+            "inputImage": input_image,
+            #"inputText": raw_text,
+        })
+
+        os_document = {
+            "image_words": raw_text,
+            "image_vector": input_image,
+        }
+        print('os_document prepared')
+        return os_document
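For readers following the loader, here is a minimal, hypothetical usage sketch (the bucket and key names are placeholders, not part of this commit; it assumes the resized image and the Rekognition text file already exist in S3):

from helpers.image_loader import image_loader

# Hypothetical keys; the '-resized.png' image and the '.txt' Rekognition output
# are produced by earlier steps of the ingestion pipeline.
loader = image_loader(
    bucket="my-ingestion-input-bucket",
    image_file="photos/dog-resized.png",
    image_detail_file="photos/dog.txt",
)

docs = loader.load()                                      # [Document] whose page_content is a JSON string holding the base64 image
url = loader.get_presigned_url()                          # 15-minute presigned GET URL for the image object
os_document = loader.prepare_document_for_direct_load()   # {"image_words": ..., "image_vector": ...}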

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/opensearch_helper.py

+52-3
@@ -37,12 +37,12 @@ def check_if_index_exists(index_name: str, region: str, host: str, http_auth: Tu
     print(f"index_name={index_name}, exists={exists}")
     return exists
 
-def process_shard(shard, os_index_name, os_domain_ep, os_http_auth) -> int:
+def process_shard(shard, os_index_name, os_domain_ep, os_http_auth, model_id) -> int:
     print(f'Starting process_shard of {len(shard)} chunks.')
     bedrock_client = boto3.client('bedrock-runtime')
     embeddings = BedrockEmbeddings(
         client=bedrock_client,
-        model_id="amazon.titan-embed-text-v1")
+        model_id=model_id)
     opensearch_url = os_domain_ep if os_domain_ep.startswith("https://") else f"https://{os_domain_ep}"
     docsearch = OpenSearchVectorSearch(index_name=os_index_name,
                                        embedding_function=embeddings,
@@ -53,4 +53,53 @@ def process_shard(shard, os_index_name, os_domain_ep, os_http_auth) -> int:
                                        connection_class = RequestsHttpConnection)
     docsearch.add_documents(documents=shard)
     print(f'Shard completed')
-    return 0
+    return 0
+
+# TODO - Use this to create the index in OS directly if the langchain community process_shard
+# throws issues with images; otherwise remove this function.
+def create_index_for_image(index_name: str, region: str, host: str, http_auth: Tuple[str, str], document):
+    # Create the index; generates a warning if the index already exists
+    print('create index on os without langchain utility')
+    aos_client = OpenSearch(
+        hosts=[{'host': host.replace("https://", ""), 'port': 443}],
+        http_auth=http_auth,
+        use_ssl=True,
+        verify_certs=True,
+        connection_class=RequestsHttpConnection
+    )
+
+    # Create an index
+    if not aos_client.indices.exists(index=index_name):
+        print('connection made, creating index....')
+        aos_client.indices.create(
+            index=index_name,
+            body={
+                "settings": {
+                    "index.knn": True,
+                    "index.knn.space_type": "cosinesimil",
+                    "analysis": {
+                        "analyzer": {"default": {"type": "standard", "stopwords": "_english_"}}
+                    },
+                },
+                "mappings": {
+                    "properties": {
+                        "image_vector": {
+                            "type": "knn_vector",
+                            "dimension": len(document["image_vector"]),
+                            "store": True,
+                        },
+                        "image_path": {"type": "text", "store": True},
+                        "image_words": {"type": "text", "store": True},
+                        "celebrities": {"type": "text", "store": True},
+                    }
+                }
+            }
+        )
+    else:
+        print('index already exists, loading document....')
+
+    # Index the document
+    result = aos_client.index(
+        index=index_name, body=document
+    )
+    print(f'embeddings uploaded in os {result}')
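A hedged sketch of how this helper could be invoked; the index name, region, endpoint, credentials, and document values below are placeholders for illustration, not values from this commit:

# Placeholder values only.
document = {
    "image_words": "golden retriever, park, ball",      # Rekognition text for the image
    "image_vector": "<base64-encoded image bytes>",     # as returned by prepare_document_for_direct_load
}

create_index_for_image(
    index_name="rag-image-index",                        # assumed index name
    region="us-east-1",                                  # assumed region
    host="my-domain.us-east-1.es.amazonaws.com",         # OpenSearch endpoint (scheme is stripped inside)
    http_auth=("os_user", "os_password"),                # basic-auth tuple, as in check_if_index_exists
    document=document,
)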

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/lambda.py

+74-26
@@ -18,7 +18,12 @@
 import numpy as np
 import tempfile
 from helpers.credentials_helper import get_credentials
-from helpers.opensearch_helper import check_if_index_exists, process_shard
+from helpers.csv_loader import csv_loader
+from helpers.image_loader import image_loader
+from helpers.msdoc_loader import msdoc_loader
+from helpers.html_loader import html_loader
+
+from helpers.opensearch_helper import check_if_index_exists, process_shard, create_index_for_image
 from helpers.update_ingestion_status import updateIngestionJobStatus
 from langchain_community.embeddings import BedrockEmbeddings
 from helpers.s3inmemoryloader import S3TxtFileLoaderInMemory
@@ -43,6 +48,7 @@
 
 opensearch_secret_id = os.environ['OPENSEARCH_SECRET_ID']
 bucket_name = os.environ['OUTPUT_BUCKET']
+# TODO: add input_bucket for csv|images
 opensearch_index = os.environ['OPENSEARCH_INDEX']
 opensearch_domain = os.environ['OPENSEARCH_DOMAIN_ENDPOINT']
 opensearch_api_name = os.environ['OPENSEARCH_API_NAME']
@@ -56,9 +62,10 @@
 PROCESS_COUNT=5
 INDEX_FILE="index_file"
 
-def process_documents_in_es(index_exists, shards, http_auth):
+def process_documents_in_es(index_exists, shards, http_auth, model_id):
     bedrock_client = boto3.client('bedrock-runtime')
-    embeddings = BedrockEmbeddings(client=bedrock_client)
+    embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
+    print(f'Bedrock embeddings model id :: {embeddings.model_id}')
 
     if index_exists is False:
         # create an index if the create index hint file exists
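In short, the embeddings model id is no longer hard-coded; the caller threads it in. A minimal sketch of the underlying pattern (the model id below is just the previous default, shown as a placeholder):

import boto3
from langchain_community.embeddings import BedrockEmbeddings

bedrock_client = boto3.client('bedrock-runtime')
# Any Bedrock embeddings model id enabled in the account could be passed here.
embeddings = BedrockEmbeddings(client=bedrock_client, model_id="amazon.titan-embed-text-v1")
vector = embeddings.embed_query("hello world")  # list[float] embedding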
@@ -104,11 +111,13 @@ def process_documents_in_es(index_exists, shards, http_auth):
                                 os_domain_ep=opensearch_domain,
                                 os_http_auth=http_auth)
 
-def process_documents_in_aoss(index_exists, shards, http_auth):
+def process_documents_in_aoss(index_exists, shards, http_auth, model_id):
     # Reference: https://python.langchain.com/docs/integrations/vectorstores/opensearch#using-aoss-amazon-opensearch-service-serverless
     bedrock_client = boto3.client('bedrock-runtime')
-    embeddings = BedrockEmbeddings(client=bedrock_client)
-
+    embeddings = BedrockEmbeddings(client=bedrock_client, model_id=model_id)
+
+    print(f'Bedrock embeddings model id :: {embeddings.model_id}')
+
     shard_start_index = 0
     if index_exists is False:
         OpenSearchVectorSearch.from_documents(
@@ -125,18 +134,19 @@ def process_documents_in_aoss(index_exists, shards, http_auth):
         )
         # we now need to start the loop below for the second shard
         shard_start_index = 1
-
+    print('start processing shards')
     for shard in shards[shard_start_index:]:
         results = process_shard(shard=shard,
                                 os_index_name=opensearch_index,
                                 os_domain_ep=opensearch_domain,
-                                os_http_auth=http_auth)
+                                os_http_auth=http_auth,
+                                model_id=model_id)
 
 @logger.inject_lambda_context(log_event=True)
 @tracer.capture_lambda_handler
 @metrics.log_metrics(capture_cold_start_metric=True)
 def handler(event, context: LambdaContext) -> dict:
-
+    print(f'event {event}')
     # if the secret id is not provided
     # uses username password
     if opensearch_secret_id != 'NONE': # nosec
@@ -151,33 +161,61 @@ def handler(event, context: LambdaContext) -> dict:
             session_token=credentials.token
         )
     job_id = event[0]['s3_transformer_result']['Payload']['jobid']
+    modelid = event[0]['s3_transformer_result']['Payload']['modelid']
+
+    print(f'model id :: {modelid}')
 
     logger.set_correlation_id(job_id)
     metrics.add_metadata(key='correlationId', value=job_id)
     tracer.put_annotation(key="correlationId", value=job_id)
 
     files = []
     for transformed_file in event:
-        files.append({'name':transformed_file['name'], 'status':transformed_file['s3_transformer_result']['Payload']['status']})
+        files.append({'name':transformed_file['name'],
+                      'status':transformed_file['s3_transformer_result']['Payload']['status'],
+                      'imageurl':''})
     updateIngestionJobStatus({'jobid': job_id, 'files': files})
 
+
     print(f'Loading txt raw files from {bucket_name}')
 
     docs = []
+
+    # Images are stored in s3 with a presigned url; embeddings are not required.
 
     for transformed_file in event:
+        print(f"status :: {transformed_file['s3_transformer_result']['Payload']['status']}")
         if transformed_file['s3_transformer_result']['Payload']['status'] == 'File transformed':
             filename = transformed_file['s3_transformer_result']['Payload']['name']
-            loader = S3TxtFileLoaderInMemory(bucket_name, filename)
-            sub_docs = loader.load()
-            for doc in sub_docs:
-                doc.metadata['source'] = filename
-            docs.extend(sub_docs)
+            name, extension = os.path.splitext(filename)
+            print(f"the name {name} and extension {extension}")
+            # TODO: check file format; if pdf then read raw text from output bucket and update docs[],
+            # if csv|image then read file from input bucket using langchain document loader and update docs[]
+            if(extension == '.pdf'):
+                loader = S3TxtFileLoaderInMemory(bucket_name, filename)
+                sub_docs = loader.load()
+                for doc in sub_docs:
+                    doc.metadata['source'] = filename
+                docs.extend(sub_docs)
+            if(extension == '.jpg' or extension == '.jpeg' or extension == '.png'):
+                # Try adding text to the document
+                # image_detail_file is created by aws rekognition
+                img_load = image_loader(bucket_name, f"{name}-resized.png", f"{name}.txt")
+                sub_docs = img_load.load()
+                for doc in sub_docs:
+                    doc.metadata['source'] = filename
+                docs.extend(sub_docs)
+                url = img_load.get_presigned_url()
+                print(f"url set :: {url}")
+                print("prepare os object")
+                os_document = img_load.prepare_document_for_direct_load()
 
     if not docs:
-        return {
-            'status':'nothing to ingest'
-        }
+        return {
+            'status':'nothing to ingest'
+        }
 
     text_splitter = RecursiveCharacterTextSplitter(
         # Set a really small chunk size, just to show.
@@ -192,18 +230,20 @@ def handler(event, context: LambdaContext) -> dict:
     # we can augment data here probably (PII present ? ...)
     for doc in docs:
         doc.metadata['timestamp'] = time.time()
-        doc.metadata['embeddings_model'] = 'amazon.titan-embed-text-v1'
+        # doc.metadata['embeddings_model'] = 'amazon.titan-embed-text-v1'
+        doc.metadata['embeddings_model'] = modelid
     chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
 
     db_shards = (len(chunks) // MAX_OS_DOCS_PER_PUT) + 1
-    print(f'Loading chunks into vector store ... using {db_shards} shards')
     shards = np.array_split(chunks, db_shards)
 
     # first check if index exists, if it does then call the add_documents function
     # otherwise call the from_documents function which would first create the index
     # and then do a bulk add. Both add_documents and from_documents do a bulk add
     # but it is important to call from_documents first so that the index is created
     # correctly for K-NN
+
+    print('check if index exists')
     try:
         index_exists = check_if_index_exists(opensearch_index,
                                              aws_region,
@@ -218,19 +258,27 @@ def handler(event, context: LambdaContext) -> dict:
             'status':'failed'
         }
 
-    if opensearch_api_name == "es":
-        process_documents_in_es(index_exists, shards, http_auth)
-    elif opensearch_api_name == "aoss":
-        process_documents_in_aoss(index_exists, shards, http_auth)
+    print(f'job_id :: {job_id}')
+    if(job_id == "101"):
+        print('running for job_id 101, use os directly')
+        create_index_for_image(os_document)
+    else:
+        print(f'Loading chunks into vector store ... using {db_shards} shards')
+        if opensearch_api_name == "es":
+            process_documents_in_es(index_exists, shards, http_auth, modelid)
+        elif opensearch_api_name == "aoss":
+            process_documents_in_aoss(index_exists, shards, http_auth, modelid)
 
+
 
     for file in files:
         if file['status'] == 'File transformed':
-            file['status'] = 'Ingested'
+            file['status'] = 'Ingested'
+            file['imageurl'] = url
         else:
             file['status'] = 'Error_'+file['status']
     updateIngestionJobStatus({'jobid': job_id, 'files': files})
 
     return {
         'status':'succeed'
-    }
+    }
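For reference, a hedged sketch of the input event shape the handler reads (field names inferred from the accesses above; real Step Functions payloads may carry additional fields, and the values are placeholders):

# Illustrative handler input; not an actual payload from this commit.
event = [
    {
        "name": "dog.jpg",
        "s3_transformer_result": {
            "Payload": {
                "jobid": "101",                            # "101" triggers the direct OpenSearch path above
                "modelid": "amazon.titan-embed-image-v1",  # assumed Bedrock embeddings model id
                "status": "File transformed",
                "name": "dog.jpg"
            }
        }
    }
]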
