@@ -109,16 +109,8 @@ def process_documents_in_es(index_exists, shards, http_auth,model_id):
         os_http_auth=http_auth)
 
 def process_documents_in_aoss(index_exists, shards, http_auth,model_id):
-    # Reference: https://python.langchain.com/docs/integrations/vectorstores/opensearch#using-aoss-amazon-opensearch-service-serverless
     bedrock_client = boto3.client('bedrock-runtime')
-    # if(model_id=='amazon.titan-embed-image-v1'):
-    #     print(f'image embeddings shards[0] {shards}')
-    #     embeddings = image_loader.BedrockEmbeddings_image(docs=shards[0], model_id=model_id,)
-    # else:
-    #     embeddings = BedrockEmbeddings(client=bedrock_client,model_id=model_id)
     embeddings = BedrockEmbeddings(client=bedrock_client,model_id=model_id)
-
-    print(f'check index with :: {shards[0]}')
 
     shard_start_index = 0
     if index_exists is False:
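Note: this hunk drops the commented-out image-specific branch and always builds a plain BedrockEmbeddings instance. A minimal sketch of the LangChain API in use here, assuming the langchain_community import path (older LangChain versions expose it from langchain.embeddings instead):

    import boto3
    from langchain_community.embeddings import BedrockEmbeddings

    bedrock_client = boto3.client('bedrock-runtime')
    embeddings = BedrockEmbeddings(client=bedrock_client,
                                   model_id='amazon.titan-embed-text-v1')
    vector = embeddings.embed_query('hello world')  # list of floats; length depends on the model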
@@ -166,7 +158,7 @@ def handler(event, context: LambdaContext) -> dict:
     job_id = event[0]['s3_transformer_result']['Payload']['jobid']
     modelid = event[0]['s3_transformer_result']['Payload']['modelid']
 
-    print(f'model id :: {modelid}')
+    logger.info(f'model id :: {modelid}')
 
     logger.set_correlation_id(job_id)
     metrics.add_metadata(key='correlationId', value=job_id)
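Note: print is swapped for the structured logger that already carries the job correlation id. A sketch of the Lambda Powertools setup this handler appears to rely on (the module-level names and the metrics namespace are assumed; they are not shown in this diff):

    from aws_lambda_powertools import Logger, Metrics
    from aws_lambda_powertools.utilities.typing import LambdaContext

    logger = Logger()
    metrics = Metrics(namespace='docs-pipeline')  # hypothetical namespace

    def handler(event, context: LambdaContext) -> dict:
        job_id = event[0]['s3_transformer_result']['Payload']['jobid']
        logger.set_correlation_id(job_id)  # every later logger.info line carries the job id
        logger.info('processing started')
        return {'status': 'ok'}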
@@ -181,7 +173,7 @@ def handler(event, context: LambdaContext) -> dict:
 
 
 
-    print(f'Loading txt raw files from {bucket_name}')
+    logger.info(f'Loading txt raw files from {bucket_name}')
 
     docs = []
 
@@ -228,7 +220,6 @@ def process_text_embeddings(docs,modelid,http_auth,files,job_id):
     # we can augment data here probably (PII present ? ...)
     for doc in docs:
         doc.metadata['timestamp'] = time.time()
-        # doc.metadata['embeddings_model'] = 'amazon.titan-embed-text-v1'
         doc.metadata['embeddings_model'] = modelid
     chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
 
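Note: each source document is stamped with a timestamp and the embeddings model before chunking, and create_documents copies metadatas[i] onto every chunk cut from document i, so that provenance survives splitting. A small self-contained illustration (the splitter's construction is not shown in this diff; RecursiveCharacterTextSplitter is assumed):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0)
    chunks = text_splitter.create_documents(
        ['a long body of text ' * 20],
        metadatas=[{'embeddings_model': 'amazon.titan-embed-text-v1'}])
    # every chunk inherits the metadata of the document it was cut from
    assert all(c.metadata['embeddings_model'] == 'amazon.titan-embed-text-v1' for c in chunks)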
@@ -276,7 +267,7 @@ def process_image_embeddings(docs,modelid,http_auth,files,job_id,url):
     for doc in docs:
         doc.metadata['timestamp'] = time.time()
         doc.metadata['embeddings_model'] = modelid
-
+    # not using text splitter, using whole image embedding as one array
     shards = np.array_split(docs,1)
 
     try:
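Note: the new comment documents why the image path has no splitter step: each image is embedded whole, so np.array_split(docs, 1) simply wraps all documents in a single shard to keep the downstream sharded-processing interface intact. A quick illustration of the call's behavior:

    import numpy as np

    docs = ['img1', 'img2', 'img3']
    shards = np.array_split(docs, 1)  # one shard containing every document
    assert len(shards) == 1 and list(shards[0]) == docs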