Skip to content

Commit 10247ba

Browse files
author
Dinesh Sajwan
committed
feat(coderefactoring): code clean up
1 parent 0698e49 commit 10247ba

File tree

7 files changed

+38
-66
lines changed

7 files changed

+38
-66
lines changed

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/image_loader.py

+9-14
Original file line number | Diff line number | Diff line change
@@ -10,19 +10,21 @@
1010
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
1111
# and limitations under the License.
1212
#
13-
import base64
14-
import json
13+
1514
import os
1615
import time
16+
import base64
17+
import json
18+
import numpy as np
19+
20+
from pathlib import Path
1721
from typing import List
1822
from aiohttp import ClientError
19-
from pathlib import Path
20-
import numpy as np
23+
2124

2225

2326

2427
from aws_lambda_powertools import Logger, Tracer
25-
#from langchain_community.document_loaders.image import UnstructuredImageLoader
2628
from langchain.docstore.document import Document
2729

2830
import boto3
@@ -88,21 +90,14 @@ def load(self):
8890
"""Load documents."""
8991
try:
9092
local_file_path = self.download_file(self.image_file)
91-
92-
# with open(f"{local_file_path}", "rb") as image_file:
93-
# input_image = base64.b64encode(image_file.read()).decode("utf8")
94-
93+
9594
b64_image_file_path = self.encode_image_to_base64(local_file_path,self.image_file)
9695
print(f'b64_image_file :: {b64_image_file_path}')
9796

9897
with open(b64_image_file_path, "rb") as b64_image_file:
9998
input_image_b64 = b64_image_file.read().decode('utf-8')
10099

101-
#embeddings=self.get_image_embeddings(input_image_b64,self.modelid)
102-
103-
# if embeddings is None:
104-
# logger.error(f"error creating multimodal embeddings for {self.image_file}")
105-
100+
106101
obj = s3_client.get_object(Bucket=self.bucket, Key=self.image_detail_file)
107102
raw_text = obj['Body'].read().decode('utf-8')
108103

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/helpers/opensearch_helper.py

-6
Original file line number | Diff line number | Diff line change
@@ -41,12 +41,6 @@ def check_if_index_exists(index_name: str, region: str, host: str, http_auth: Tu
4141

4242
def process_shard(shard, os_index_name, os_domain_ep, os_http_auth,model_id) -> int:
4343
bedrock_client = boto3.client('bedrock-runtime')
44-
45-
# if(model_id=='amazon.titan-embed-image-v1'):
46-
# print(f' save image embeddings in OS')
47-
# embeddings = image_loader.BedrockEmbeddings_image(docs=shard, model_id=model_id,)
48-
# else:
49-
# embeddings = BedrockEmbeddings(client=bedrock_client,model_id=model_id)
5044
embeddings = BedrockEmbeddings(client=bedrock_client,model_id=model_id)
5145

5246
opensearch_url = os_domain_ep if os_domain_ep.startswith("https://") else f"https://{os_domain_ep}"

lambda/aws-rag-appsync-stepfn-opensearch/embeddings_job/src/lambda.py

+3-12
Original file line number | Diff line number | Diff line change
@@ -109,16 +109,8 @@ def process_documents_in_es(index_exists, shards, http_auth,model_id):
109109
os_http_auth=http_auth)
110110

111111
def process_documents_in_aoss(index_exists, shards, http_auth,model_id):
112-
# Reference: https://python.langchain.com/docs/integrations/vectorstores/opensearch#using-aoss-amazon-opensearch-service-serverless
113112
bedrock_client = boto3.client('bedrock-runtime')
114-
# if(model_id=='amazon.titan-embed-image-v1'):
115-
# print(f'image embeddings shards[0] {shards}')
116-
# embeddings = image_loader.BedrockEmbeddings_image(docs=shards[0], model_id=model_id,)
117-
# else:
118-
# embeddings = BedrockEmbeddings(client=bedrock_client,model_id=model_id)
119113
embeddings = BedrockEmbeddings(client=bedrock_client,model_id=model_id)
120-
121-
print(f' check index with :: {shards[0]}')
122114

123115
shard_start_index = 0
124116
if index_exists is False:
@@ -166,7 +158,7 @@ def handler(event, context: LambdaContext) -> dict:
166158
job_id = event[0]['s3_transformer_result']['Payload']['jobid']
167159
modelid = event[0]['s3_transformer_result']['Payload']['modelid']
168160

169-
print(f' model id :: {modelid}')
161+
logger.info(f' model id :: {modelid}')
170162

171163
logger.set_correlation_id(job_id)
172164
metrics.add_metadata(key='correlationId', value=job_id)
@@ -181,7 +173,7 @@ def handler(event, context: LambdaContext) -> dict:
181173

182174

183175

184-
print(f'Loading txt raw files from {bucket_name}')
176+
logger.info(f'Loading txt raw files from {bucket_name}')
185177

186178
docs = []
187179

@@ -228,7 +220,6 @@ def process_text_embeddings(docs,modelid,http_auth,files,job_id):
228220
# we can augment data here probably (PII present ? ...)
229221
for doc in docs:
230222
doc.metadata['timestamp'] = time.time()
231-
# doc.metadata['embeddings_model'] = 'amazon.titan-embed-text-v1'
232223
doc.metadata['embeddings_model'] = modelid
233224
chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
234225

@@ -276,7 +267,7 @@ def process_image_embeddings(docs,modelid,http_auth,files,job_id,url):
276267
for doc in docs:
277268
doc.metadata['timestamp'] = time.time()
278269
doc.metadata['embeddings_model'] = modelid
279-
270+
# Not using the text splitter here; the whole image embedding is kept as one array.
280271
shards = np.array_split(docs,1)
281272

282273
try:

lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/helpers/image_transformer.py

+5-18
Original file line number | Diff line number | Diff line change
@@ -10,11 +10,13 @@
1010
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
1111
# and limitations under the License.
1212
#
13-
from typing import List
14-
import boto3
1513
import os
16-
from aws_lambda_powertools import Logger, Tracer
14+
import boto3
15+
16+
1717
from PIL import Image
18+
from typing import List
19+
from aws_lambda_powertools import Logger, Tracer
1820

1921

2022

@@ -49,16 +51,6 @@ def from_file(cls, image_file_name, rekognition_client, file_name=None):
4951
name = image_file_name if file_name is None else file_name
5052
return cls(image_bytes, name, rekognition_client)
5153

52-
# @tracer.capture_method
53-
# def load(self) -> str:
54-
# """Load documents."""
55-
# try:
56-
# # TODO add transformation logic
57-
# print(f"No transformation logic implemented, copy the file {self.key} to processed bucket")
58-
# except Exception as exception:
59-
# logger.exception(f"Reason: {exception}")
60-
# return ""
61-
6254

6355
@tracer.capture_method
6456
def check_moderation(self)-> str:
@@ -89,8 +81,6 @@ def detect_image_lables(self)-> str:
8981
return labels
9082

9183

92-
93-
9484
@tracer.capture_method
9585
def recognize_celebrities(self)-> str:
9686
try:
@@ -99,9 +89,6 @@ def recognize_celebrities(self)-> str:
9989
print(f'Detected faces for :: { response}')
10090
for celebrity in response['CelebrityFaces']:
10191
celebrities.append(celebrity['Name'])
102-
# for face in response['UnrecognizedFaces']:
103-
# celebrities.append(face['UnrecognizedFaces'])
104-
10592
except Exception as exp:
10693
print(f"Couldn't analyze image: {exp}")
10794

lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/helpers/utils.py

+18-9
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,27 @@
1+
#
2+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
5+
# with the License. A copy of the License is located at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
10+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
11+
# and limitations under the License.
12+
#
113

2-
import boto3
314
import os
15+
import boto3
416
import json
5-
from aws_lambda_powertools import Logger, Tracer, Metrics
17+
from botocore.exceptions import ClientError
618
from aws_lambda_powertools.metrics import MetricUnit
7-
from helpers.image_transformer import image_transformer
19+
820
from helpers.pdf_transformer import pdf_transformer
9-
from botocore.exceptions import ClientError
21+
from helpers.image_transformer import image_transformer
1022
from langchain_core.prompts import PromptTemplate
23+
from aws_lambda_powertools import Logger, Tracer, Metrics
24+
1125

1226

1327

@@ -46,11 +60,6 @@ def transform_image_document(input_bucket: str,file_name: str,output_bucket: str
4660
return 'Image not supported'
4761
else:
4862
result_lables = imt_object.detect_image_lables()
49-
# result_celeb = imt_object.recognize_celebrities()
50-
# image_details = {
51-
# "image_lables":result_lables,
52-
# "image_celeb":result_celeb
53-
# }
5463

5564
name, extension = os.path.splitext(file_name)
5665
lables_txt= convert_lables_to_sentence(result_lables)

lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/lambda.py

+1-5
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,6 @@
1212
#
1313
import os
1414
import boto3
15-
#from helpers.unstructured_s3_connector import run_s3_Connector
16-
1715

1816

1917
from aws_lambda_powertools import Logger, Tracer, Metrics
@@ -33,8 +31,6 @@
3331
input_bucket = os.environ['INPUT_BUCKET']
3432
output_bucket = os.environ['OUTPUT_BUCKET']
3533

36-
# input_bucket = "persistencestack-inputassets7d1d3f52-qert2sgpwhtu"
37-
# output_bucket = "persistencestack-processedassets6ba25f4c-zmebuvdaelig"
3834

3935
@tracer.capture_method
4036
def file_exists_in_bucket(bucket_name, object_name,):
@@ -94,7 +90,7 @@ def handler(event, context: LambdaContext) -> dict:
9490
print(f' pdf processed ::' )
9591
elif(extension == '.jpg'or extension == '.jpeg' or extension == '.png' or extension == '.svg'):
9692
response['status'] = transform_image_document(input_bucket,file_name,output_bucket)
97-
#TODO add csv, doc, docx file type suport as well.
93+
#TODO add csv, doc, docx file type support as well.
9894
else:
9995
response['status'] = 'File Not transformed'
10096
else:

lambda/aws-rag-appsync-stepfn-opensearch/s3_file_transformer/src/requirements.txt

+2-2
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,6 @@ typing-extensions
55
boto3>=1.34.29
66
requests
77
langchain==0.1.4
8-
pypdf2
9-
Pillow
8+
pypdf2==4.0.2
9+
Pillow==10.2.0
1010
langchain-community==0.0.16

0 commit comments

Comments
 (0)