
Commit 9ee580e

Author: Dinesh Sajwan (committed)
feat(visualqa): merged dataingestion changes and updated image transformer
1 parent 10f556f, commit 9ee580e

File tree: 12 files changed, +151 -32 lines changed

lambda/aws-qa-appsync-opensearch/question_answering/src/llms/text_generation_llm_selector.py (+2 -2)

@@ -50,9 +50,9 @@ def get_llm(callbacks=None):
 
         return Bedrock(**kwargs)
 
-def get_embeddings_llm():
+def get_embeddings_llm(model_id):
     bedrock = boto3.client('bedrock-runtime')
-    return BedrockEmbeddings(client=bedrock, model_id="amazon.titan-embed-text-v1")
+    return BedrockEmbeddings(client=bedrock, model_id=model_id)
 
 def get_max_tokens():
     return 200000
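
A minimal usage sketch (not part of the commit), assuming the import path matches the module location and the payload shape read in chain.py below; the embeddings model id now flows in from the request instead of being hard-coded:

from llms.text_generation_llm_selector import get_embeddings_llm

input_params = {"embeddings_model": {"modelId": "amazon.titan-embed-text-v1"}}  # hypothetical payload
embeddings = get_embeddings_llm(input_params["embeddings_model"]["modelId"])
vector = embeddings.embed_query("What does the report conclude?")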

lambda/aws-qa-appsync-opensearch/question_answering/src/qa_agent/chain.py (+7 -4)

@@ -98,10 +98,11 @@ def run_question_answering(arguments):
 _doc_index = None
 _current_doc_index = None
 def run_qa_agent_rag_no_memory(input_params):
-    logger.info("starting qa agent with rag approach without memory")
+    logger.info("starting qa agent with rag approach without memory :: {input_params}")
 
     base64_bytes = input_params['question'].encode("utf-8")
-
+    model_id = input_params['embeddings_model']['modelId']
+    print(f'model id :: {model_id}')
     sample_string_bytes = base64.b64decode(base64_bytes)
     decoded_question = sample_string_bytes.decode("utf-8")
 

@@ -127,18 +128,20 @@ def run_qa_agent_rag_no_memory(input_params):
                                                os.environ.get('OPENSEARCH_API_NAME'),
                                                os.environ.get('OPENSEARCH_DOMAIN_ENDPOINT'),
                                                os.environ.get('OPENSEARCH_INDEX'),
-                                               os.environ.get('OPENSEARCH_SECRET_ID'))
+                                               os.environ.get('OPENSEARCH_SECRET_ID'),
+                                               model_id)
 
     else:
         logger.info("_retriever already exists")
 
     _current_doc_index = _doc_index
 
     logger.info("Starting similarity search")
-    max_docs = input_params['max_docs']
+    max_docs = input_params['retrieval']['max_docs']
     output_file_name = input_params['filename']
 
     source_documents = doc_index.similarity_search(decoded_question, k=max_docs)
+    logger.info(source_documents)
     # --------------------------------------------------------------------------
     # If an output file is specified, filter the response to only include chunks
     # related to that file. The source metadata is added when embeddings are
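
For reference, a sketch of the request payload these lookups assume; the field names come from the keys accessed above, while the values are hypothetical. (Note the new log line is a plain string rather than an f-string, so it prints the literal {input_params} placeholder.)

# Hypothetical event payload for run_qa_agent_rag_no_memory after this change.
input_params = {
    "question": "V2hhdCBkb2VzIHRoZSBkaWFncmFtIHNob3c/",  # base64 for "What does the diagram show?"
    "filename": "architecture.png",
    "embeddings_model": {"modelId": "amazon.titan-embed-text-v1"},
    "retrieval": {"max_docs": 5},
}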

lambda/aws-qa-appsync-opensearch/question_answering/src/qa_agent/helper.py (+3 -2)

@@ -90,7 +90,8 @@ def load_vector_db_opensearch(region: str,
                               opensearch_api_name: str,
                               opensearch_domain_endpoint: str,
                               opensearch_index: str,
-                              secret_id: str) -> OpenSearchVectorSearch:
+                              secret_id: str,
+                              model_id: str) -> OpenSearchVectorSearch:
     print(f"load_vector_db_opensearch, region={region}, "
           f"opensearch_domain_endpoint={opensearch_domain_endpoint}, opensearch_index={opensearch_index}")
 

@@ -107,7 +108,7 @@ def load_vector_db_opensearch(region: str,
         opensearch_api_name,
         session_token=credentials.token,
     )
-    embedding_function = get_embeddings_llm()
+    embedding_function = get_embeddings_llm(model_id)
 
     opensearch_url = opensearch_domain_endpoint if opensearch_domain_endpoint.startswith("https://") else f"https://{opensearch_domain_endpoint}"
     vector_db = OpenSearchVectorSearch(index_name=opensearch_index,

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/image_loader.py (+4 -4)

@@ -36,15 +36,15 @@ def __init__(self, bucket: str, image_file: str,image_detail_file: str):
         self.bucket = bucket
         self.image_file = image_file
         self.image_detail_file = image_detail_file
-
+        print(f"load image {image_file}, and image txt {image_detail_file} from :: {bucket}")
+
 
 
     @tracer.capture_method
     def load(self):
         """Load documents."""
         try:
             local_file_path = self.download_file(self.image_file)
-            print(f"file downloaded :: {local_file_path}")
 
             with open(f"{local_file_path}", "rb") as image_file:
                 input_image = base64.b64encode(image_file.read()).decode("utf8")

@@ -57,9 +57,9 @@ def load(self):
 
             docs = json.dumps({
                 "inputImage": input_image,
-                #"inputText": raw_text,
+                "inputText": raw_text,
             })
-            #print(f'docs for titan embeddings {docs}')
+            print(f'raw_text for titan embeddings {raw_text}')
             return [Document(page_content=docs, metadata=metadata)]
 
         except Exception as exception:
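
With inputText enabled, each image document now carries both the base64-encoded image and the Rekognition-derived text. A sketch of how a payload with this shape maps onto a Bedrock embedding call, assuming Amazon Titan Multimodal Embeddings is the target model (the commit itself does not name it):

import base64
import json
import boto3

bedrock = boto3.client("bedrock-runtime")

with open("architecture-resized.png", "rb") as f:  # hypothetical local image
    input_image = base64.b64encode(f.read()).decode("utf8")
raw_text = "A person working at a desk with a laptop."  # hypothetical label summary

body = json.dumps({"inputImage": input_image, "inputText": raw_text})
response = bedrock.invoke_model(
    body=body,
    modelId="amazon.titan-embed-image-v1",  # assumed embedding model id
    accept="application/json",
    contentType="application/json",
)
embedding = json.loads(response["body"].read())["embedding"]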

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/update_ingestion_status.py (+2)

@@ -44,6 +44,7 @@ def updateIngestionJobStatus(variables):
             files {
                 name
                 status
+                imageurl
             }
             ingestionjobid
         }

@@ -54,6 +55,7 @@ def updateIngestionJobStatus(variables):
     query = query.replace("$files", str(variables['files']).replace("\'", "\""))
     query = query.replace("\"name\"", "name")
     query = query.replace("\"status\"", "status")
+    query = query.replace("\"imageurl\"", "imageurl")
 
     request = {'query':query}
 
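
To illustrate what the added replacement does (file values are hypothetical): the files list is serialized as a Python literal, quotes are normalized, and the quotes around the field names name, status and imageurl are stripped so they render as GraphQL fields rather than string literals.

files = [{"name": "diagram.png", "status": "succeed", "imageurl": "https://example.com/presigned-url"}]
rendered = str(files).replace("'", '"')
for field in ("name", "status", "imageurl"):
    rendered = rendered.replace(f'"{field}"', field)
print(rendered)
# [{name: "diagram.png", status: "succeed", imageurl: "https://example.com/presigned-url"}]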

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/lambda.py (+4 -7)

@@ -18,10 +18,8 @@
 import numpy as np
 import tempfile
 from helpers.credentials_helper import get_credentials
-from helpers.csv_loader import csv_loader
 from helpers.image_loader import image_loader
-from helpers.msdoc_loader import msdoc_loader
-from helpers.html_loader import html_loader
+
 
 from helpers.opensearch_helper import check_if_index_exists, process_shard, create_index_for_image
 from helpers.update_ingestion_status import updateIngestionJobStatus

@@ -198,17 +196,16 @@ def handler(event, context: LambdaContext) -> dict:
             for doc in sub_docs:
                 doc.metadata['source'] = filename
             docs.extend(sub_docs)
-        if(extension == '.jpg' or extension == '.jpeg' or extension == '.png'):
+        if(extension == '.jpg' or extension == '.jpeg' or extension == '.png' or extension == '.svg'):
             # Try adding text to document
             #image_detal_file is created by aws rekognition
-            img_load = image_loader(bucket_name, f"{name}-resized.png",f"{name}.txt")
+            img_load = image_loader(bucket_name, f"{name}-resized{extension}",f"{name}.txt")
             sub_docs = img_load.load()
             for doc in sub_docs:
                 doc.metadata['source'] = filename
             docs.extend(sub_docs)
             url = img_load.get_presigned_url()
-            print(f" url set :: {url} ")
-            print(f" prepare os object ")
+            print(f" source :: {filename} ")
             os_document = img_load.prepare_document_for_direct_load()
 
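
A small illustration of the new naming convention (the key is hypothetical): the loader now expects the resized image to keep its original extension rather than always ending in -resized.png, with the Rekognition text file alongside it.

import os

name, extension = os.path.splitext("diagrams/architecture.jpeg")
resized_key = f"{name}-resized{extension}"  # "diagrams/architecture-resized.jpeg" (previously "...-resized.png")
detail_key = f"{name}.txt"                  # "diagrams/architecture.txt", written by the Rekognition step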

lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/helpers/image_transformer.py (+5 -3)

@@ -78,15 +78,17 @@ def check_moderation(self)-> str:
 
     @tracer.capture_method
     def detect_image_lables(self)-> str:
         try:
-            labels=[]
+            labels=''
             response = self.rekognition_client.detect_labels(Image=self.image,MaxLabels=20 )
             for label in response['Labels']:
-                print(label)
+                name = label['Name']
                 if(label['Confidence'] > 0.80):
-                    labels.append(label['Name'])
+                    labels = labels + label['Name'] + ","
         except Exception as exp:
             print(f"Couldn't analyze image: {exp}")
         return labels
+
+
 
 
     @tracer.capture_method
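
For comparison, a sketch of an equivalent way to build the comma-separated label string outside the class (bucket and key are placeholders). One caveat worth flagging: Rekognition reports Confidence on a 0-100 scale, so the 0.80 threshold above keeps practically every label; 80 is presumably the intended cut-off.

import boto3

rekognition = boto3.client("rekognition")
response = rekognition.detect_labels(
    Image={"S3Object": {"Bucket": "my-bucket", "Name": "photo.jpg"}},  # placeholder image reference
    MaxLabels=20,
)
names = [label["Name"] for label in response["Labels"] if label["Confidence"] > 80]
labels = ",".join(names)  # same comma-separated result, without the trailing comma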
lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/helpers/pdf_transformer.py (new file, +48)

@@ -0,0 +1,48 @@
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+# with the License. A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+# and limitations under the License.
+#
+from typing import List
+
+from langchain.document_loaders.base import BaseLoader
+from helpers.s3inmemoryloader import S3FileLoaderInMemory
+
+from aws_lambda_powertools import Logger, Tracer
+from PyPDF2 import PdfReader
+from io import BytesIO
+
+
+logger = Logger(service="INGESTION_FILE_TRANSFORMER")
+tracer = Tracer(service="INGESTION_FILE_TRANSFORMER")
+
+@tracer.capture_method
+class pdf_transformer(BaseLoader):
+    """Transforming logic for pdf documents from s3 ."""
+
+    def __init__(self, bucket: str, key: str):
+        """Initialize with bucket and key name."""
+        self.bucket = bucket
+        self.key = key
+
+    def load(self) -> str:
+        """Load documents."""
+        try:
+            # TODO: add transformation logic
+            encodedpdf = S3FileLoaderInMemory(self.bucket, self.key).load
+            pdfFile = PdfReader(BytesIO(encodedpdf))
+            raw_text = []
+            for page in pdfFile.pages:
+                raw_text.append(page.extract_text())
+            return '\n'.join(raw_text)
+        except Exception as exception:
+            logger.exception(f"Reason: {exception}")
+            return ""
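
A usage sketch for the new transformer (bucket and key are hypothetical). One thing to watch: S3FileLoaderInMemory(self.bucket, self.key).load as written references the bound method instead of calling it, so PdfReader receives a method object rather than the PDF bytes; .load() looks like the intent.

from helpers.pdf_transformer import pdf_transformer

transformer = pdf_transformer("my-input-bucket", "docs/whitepaper.pdf")
raw_text = transformer.load()  # page texts joined with newlines, or "" if extraction fails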

lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/helpers/utils.py (+66 -8)

@@ -5,7 +5,9 @@
 from aws_lambda_powertools import Logger, Tracer, Metrics
 from aws_lambda_powertools.metrics import MetricUnit
 from helpers.image_transformer import image_transformer
+from helpers.pdf_transformer import pdf_transformer
 from botocore.exceptions import ClientError
+from langchain_core.prompts import PromptTemplate
 
 
 

@@ -18,14 +20,22 @@
 
 @tracer.capture_method
 def isvalid_file_format(file_name: str) -> bool:
-    file_format = ['.pdf','.txt','.jpg','.png','.csv','.docx','.ppt','.html','.jpeg']
+    file_format = ['.pdf','.txt','.jpg','.png','.jpeg','.svg']
     if file_name.endswith(tuple(file_format)):
         return True
     else:
         print(f'Invalid file format :: {file_format}')
         return False
 
-
+@tracer.capture_method
+def transform_pdf_document(input_bucket: str,file_name: str,output_bucket: str,output_file_name:str):
+    document_content = pdf_transformer(input_bucket,file_name)
+    if not document_content:
+        return 'Unable to load document'
+    else:
+        encoded_string = document_content.encode("utf-8")
+        s3.Bucket(output_bucket).put_object(Key=output_file_name, Body=encoded_string)
+        return 'File transformed'
 
 @tracer.capture_method
 def transform_image_document(input_bucket: str,file_name: str,output_bucket: str):

@@ -40,19 +50,67 @@ def transform_image_document(input_bucket: str,file_name: str,output_bucket: str
     image_details = {
         "image_lables":result_lables,
         "image_celeb":result_celeb
-    }
+        }
+
     name, extension = os.path.splitext(file_name)
+
+    lables_txt= convert_lables_to_sentence(result_lables)
+    # with open ('/tmp/'+name+'.txt','w') as f:
+    #     f.write(json.dumps(image_details))
+    # checking with senetence, save the senetence instead of lables
+
     with open ('/tmp/'+name+'.txt','w') as f:
-        f.write(json.dumps(image_details))
+        f.write(json.dumps(lables_txt))
+
     s3.upload_file('/tmp/'+name+'.txt',output_bucket,name+".txt")
     downloaded_file = download_file(input_bucket,file_name)
     print(f'downloaded_file:: {downloaded_file}')
 
     resize_image = imt.image_resize()
-    upload_file(output_bucket,resize_image)
+    upload_file(output_bucket,resize_image,file_name)
     #upload_file(output_bucket,file_name)
     return 'File transformed'
 
+
+@tracer.capture_method
+def convert_lables_to_sentence(labels_str)-> str:
+    try:
+        print(f"lables:: {labels_str}")
+        bedrock_client = boto3.client('bedrock-runtime')
+
+        prompt ="""\n\nHuman: Here are the comma seperated list of labels seen in the image:
+        <labels>
+        {labels}
+        </labels>
+        Please provide a human readable and understandable summary based on these labels
+        \n\nAssistant:"""
+
+
+        prompt_template = PromptTemplate.from_template(prompt)
+        prompt_template_for_lables = prompt_template.format(labels=labels_str)
+
+        body = json.dumps({"prompt": prompt_template_for_lables,
+                           "max_tokens_to_sample":300,
+                           "temperature":1,
+                           "top_k":250,
+                           "top_p":0.999,
+                           "stop_sequences":[]
+                           })
+        modelId = 'anthropic.claude-v2'
+        accept = 'application/json'
+        contentType = 'application/json'
+
+        response = bedrock_client.invoke_model(body=body,
+                                               modelId=modelId, accept=accept, contentType=contentType)
+        response_body = json.loads(response.get('body').read())
+        response_text_claud = response_body.get('completion')
+        print(f"response_text_claud:: {response_text_claud}")
+        return response_text_claud
+    except Exception as exp:
+        print(f"Couldn't convert lables to sentence: {exp}")
+
+
+
 def download_file(bucket, object )-> str:
     try:
         file_path = "/tmp/" + os.path.basename(object)

@@ -64,10 +122,10 @@ def download_file(bucket, object )-> str:
     except Exception as exp:
         print(f"Couldn\'t download file : {exp}")
 
-def upload_file(bucket, object )-> str:
+def upload_file(bucket, file_name,key )-> str:
     try:
-        file_path = "/tmp/" + os.path.basename(object)
-        s3.upload_file(file_path, bucket,object)
+        file_path = "/tmp/" + os.path.basename(file_name)
+        s3.upload_file(file_path, bucket,key)
         return file_path
     except ClientError as client_err:
         print(f"Couldn\'t download file {client_err.response['Error']['Message']}")

lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/lambda.py (+1 -1)

@@ -19,7 +19,7 @@
 from aws_lambda_powertools import Logger, Tracer, Metrics
 from aws_lambda_powertools.utilities.typing import LambdaContext
 from aws_lambda_powertools.metrics import MetricUnit
-from helpers.utils import isvalid_file_format,transform_csv_document,transform_pdf_document,transform_msdoc_document_file,transform_image_document
+from helpers.utils import isvalid_file_format,transform_pdf_document,transform_image_document
 
 
 

lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/requirements.txt (+1 -1)

@@ -2,7 +2,7 @@ aws-lambda-powertools
 aws-xray-sdk
 fastjsonschema
 typing-extensions
-boto3
+boto3>=1.34.29
 requests
 langchain==0.1.4
 pypdf2

src/patterns/gen-ai/aws-rag-appsync-stepfn-opensearch/index.ts (+8)

@@ -521,6 +521,14 @@ export class RagAppsyncStepfnOpensearch extends Construct {
         resources: ['*'],
       }));
 
+    s3_transformer_job_function_role.addToPolicy(new iam.PolicyStatement({
+      effect: iam.Effect.ALLOW,
+      actions: ['bedrock:*'],
+      resources: [
+        'arn:' + Aws.PARTITION + ':bedrock:' + Aws.REGION + '::foundation-model',
+        'arn:' + Aws.PARTITION + ':bedrock:' + Aws.REGION + '::foundation-model/*',
+      ],
+    }));
 
     s3_transformer_job_function_role.addToPolicy(
       new iam.PolicyStatement({
