
Commit 9047fde

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent e3e4849 commit 9047fde

File tree: 2 files changed, +62 -35 lines changed

scripts/bulkdownloadfromurls.py

+16 -12
@@ -3,14 +3,15 @@
 from urllib.parse import urlsplit, quote_plus
 import pandas as pd
 
+
 def read_urls_from_csv(csv_file, column_name):
     try:
         # Read CSV file into a DataFrame
         df = pd.read_csv(csv_file)
-
+
         # Extract URLs from specified column
         urls = df[column_name].tolist()
-
+
         return urls
     except Exception as e:
         print(f"Error reading URLs from CSV: {e}")
@@ -22,42 +23,45 @@ def download_image(url, folder):
         # Send a GET request to the URL
         response = requests.get(url, stream=True)
         response.raise_for_status()  # Check if the request was successful
-
+
         # Generate a unique filename using the URL
         filename = quote_plus(url)  # Encode URL to use as filename
         filename = filename[:25]  # Limit filename length (optional)
         filename = f"{filename}.jpg"  # Add file extension if needed
-
+
         # Create the output path
         output_path = os.path.join(folder, filename)
-
+
         # Save the image to the specified folder
-        with open(output_path, 'wb') as file:
+        with open(output_path, "wb") as file:
             for chunk in response.iter_content(8192):
                 file.write(chunk)
-
+
         print(f"Downloaded: {url} to {output_path}")
     except requests.exceptions.RequestException as e:
         print(f"Failed to download {url}: {e}")
 
+
 def download_images_from_list(url_list, folder):
     # Create the output folder if it doesn't exist
     os.makedirs(folder, exist_ok=True)
-
+
     for url in url_list:
         download_image(url, folder)
 
 
 if __name__ == "__main__":
     # CSV file containing URLs
     csv_file = "your_csv"
-    column_name = "YOUR COLUMN LINK CONTAING URLS"  # Replace with the column name containing URLs
-
+    column_name = (
+        "YOUR COLUMN LINK CONTAING URLS"  # Replace with the column name containing URLs
+    )
+
     # Read URLs from CSV
     image_urls = read_urls_from_csv(csv_file, column_name)
-
+
     # Folder to save downloaded images
     output_folder = "downloaded_images"
-
+
     # Download images
     download_images_from_list(image_urls, output_folder)
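
A quick usage sketch for the script above (not part of the commit): it assumes the scripts/ directory is on the import path so the file can be imported as a module, and that your CSV has a single column of direct image URLs. The file name "urls.csv" and column name "image_url" are placeholders, not names from the repository.

import pandas as pd

from bulkdownloadfromurls import download_images_from_list, read_urls_from_csv

# Build a tiny CSV with one URL column (placeholder data).
pd.DataFrame({"image_url": ["https://example.com/cat.jpg"]}).to_csv(
    "urls.csv", index=False
)

urls = read_urls_from_csv("urls.csv", "image_url")    # read the URL column into a list
download_images_from_list(urls, "downloaded_images")  # download each image to the folder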

scripts/simplescrapper.py

+46 -23
@@ -1,51 +1,69 @@
-
 import requests
 from bs4 import BeautifulSoup
 import pandas as pd
 
+
 def scrape_categories(url):
     try:
         # Send a GET request to the URL
         response = requests.get(url)
         response.raise_for_status()  # Check for request errors
 
         # Parse the HTML content using BeautifulSoup
-        soup = BeautifulSoup(response.content, 'html.parser')
+        soup = BeautifulSoup(response.content, "html.parser")
 
         # Find all category elements (adjust based on the website structure)
         categories = []
-        for category in soup.find_all('a', class_='category-link'):  # Adjust class or tag based on the website structure
+        for category in soup.find_all(
+            "a", class_="category-link"
+        ):  # Adjust class or tag based on the website structure
             category_name = category.text.strip()
-            category_url = category['href']
-            categories.append({'Category': category_name, 'URL': category_url})
+            category_url = category["href"]
+            categories.append({"Category": category_name, "URL": category_url})
 
         return categories
     except Exception as e:
         print(f"An error occurred while scraping categories: {e}")
         return []
 
+
 def scrape_products(category_url):
     try:
         # Send a GET request to the category URL
         response = requests.get(category_url)
         response.raise_for_status()  # Check for request errors
 
         # Parse the HTML content using BeautifulSoup
-        soup = BeautifulSoup(response.content, 'html.parser')
+        soup = BeautifulSoup(response.content, "html.parser")
 
         # Find all product elements (adjust based on the website structure)
         products = []
-        for product in soup.find_all('div', class_='product-item'):  # Adjust class or tag based on the website structure
-            product_name = product.find('h2', class_='product-name').text.strip()  # Adjust based on the website structure
-            product_price = product.find('span', class_='product-price').text.strip()  # Adjust based on the website structure
-            product_url = product.find('a', class_='product-link')['href']  # Adjust based on the website structure
-            products.append({'Product Name': product_name, 'Price': product_price, 'URL': product_url})
+        for product in soup.find_all(
+            "div", class_="product-item"
+        ):  # Adjust class or tag based on the website structure
+            product_name = product.find(
+                "h2", class_="product-name"
+            ).text.strip()  # Adjust based on the website structure
+            product_price = product.find(
+                "span", class_="product-price"
+            ).text.strip()  # Adjust based on the website structure
+            product_url = product.find("a", class_="product-link")[
+                "href"
+            ]  # Adjust based on the website structure
+            products.append(
+                {
+                    "Product Name": product_name,
+                    "Price": product_price,
+                    "URL": product_url,
+                }
+            )
 
         return products
     except Exception as e:
         print(f"An error occurred while scraping products: {e}")
         return []
 
+
 def save_to_excel(categories, output_file):
     try:
         # Create a DataFrame from the categories list
@@ -57,35 +75,40 @@ def save_to_excel(categories, output_file):
     except Exception as e:
         print(f"An error occurred while saving to Excel: {e}")
 
+
 if __name__ == "__main__":
     # Input: E-commerce website URL
     url = input("Enter the e-commerce website URL: ").strip()
 
     # Output: Excel file name
-    output_file = 'categories_and_products.xlsx'
+    output_file = "categories_and_products.xlsx"
 
     # Scrape categories
     categories = scrape_categories(url)
 
     # Scrape products for each category
     for category in categories:
-        category_url = category['URL']
+        category_url = category["URL"]
         products = scrape_products(category_url)
         for product in products:
-            product['Category'] = category['Category']  # Add category name to each product
-        category['Products'] = products
+            product["Category"] = category[
+                "Category"
+            ]  # Add category name to each product
+        category["Products"] = products
 
     # Flatten the data for saving to Excel
     all_data = []
     for category in categories:
-        for product in category.get('Products', []):
-            all_data.append({
-                'Category': category['Category'],
-                'Product Name': product['Product Name'],
-                'Price': product['Price'],
-                'Product URL': product['URL'],
-                'Category URL': category['URL']
-            })
+        for product in category.get("Products", []):
+            all_data.append(
+                {
+                    "Category": category["Category"],
+                    "Product Name": product["Product Name"],
+                    "Price": product["Price"],
+                    "Product URL": product["URL"],
+                    "Category URL": category["URL"],
+                }
+            )
 
     # Save to Excel
     save_to_excel(all_data, output_file)
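
A hypothetical illustration of the markup shape both scrapers expect (not part of the commit): the class names come straight from the script's selectors, but the HTML itself is invented, so on a real site the "adjust based on the website structure" comments should be taken literally.

from bs4 import BeautifulSoup

html = """
<a class="category-link" href="https://shop.example/books">Books</a>
<div class="product-item">
  <h2 class="product-name">Example Book</h2>
  <span class="product-price">$10.00</span>
  <a class="product-link" href="https://shop.example/books/1">View</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
category = soup.find("a", class_="category-link")
product = soup.find("div", class_="product-item")

print(category["href"])                                         # https://shop.example/books
print(product.find("h2", class_="product-name").text.strip())   # Example Book
print(product.find("span", class_="product-price").text)        # $10.00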
