Added scripts for bulk download and scraping #11920

Closed · wants to merge 6 commits
Changes from 3 commits
67 changes: 67 additions & 0 deletions scripts/bulkdownloadfromurls.py
@@ -0,0 +1,67 @@
import os
import requests
from urllib.parse import urlsplit, quote_plus

Ruff check failure (GitHub Actions): scripts/bulkdownloadfromurls.py:3:26: F401 `urllib.parse.urlsplit` imported but unused
import pandas as pd


def read_urls_from_csv(csv_file, column_name):

Ruff check failure (GitHub Actions): scripts/bulkdownloadfromurls.py:1:1: I001 Import block is un-sorted or un-formatted
    try:
        # Read CSV file into a DataFrame
        df = pd.read_csv(csv_file)

Ruff check failure (GitHub Actions): scripts/bulkdownloadfromurls.py:10:9: PD901 Avoid using the generic variable name `df` for DataFrames

        # Extract URLs from specified column
        urls = df[column_name].tolist()

        return urls
    except Exception as e:

Ruff check failure (GitHub Actions): scripts/bulkdownloadfromurls.py:16:12: BLE001 Do not catch blind exception: `Exception`

        print(f"Error reading URLs from CSV: {e}")
        return []


def download_image(url, folder):
    try:
        # Send a GET request to the URL
        response = requests.get(url, stream=True)

Ruff check failure (GitHub Actions): scripts/bulkdownloadfromurls.py:24:20: S113 Probable use of `requests` call without timeout

        response.raise_for_status()  # Check if the request was successful

        # Generate a unique filename using the URL
        filename = quote_plus(url)  # Encode URL to use as filename
        filename = filename[:25]  # Limit filename length (optional)
        filename = f"{filename}.jpg"  # Add file extension if needed

        # Create the output path
        output_path = os.path.join(folder, filename)

        # Save the image to the specified folder
        with open(output_path, "wb") as file:
            for chunk in response.iter_content(8192):
                file.write(chunk)

        print(f"Downloaded: {url} to {output_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")


def download_images_from_list(url_list, folder):
    # Create the output folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    for url in url_list:
        download_image(url, folder)


if __name__ == "__main__":
    # CSV file containing URLs
    csv_file = "your_csv"
    column_name = (
        "YOUR COLUMN LINK CONTAINING URLS"  # Replace with the column name containing URLs
    )

    # Read URLs from CSV
    image_urls = read_urls_from_csv(csv_file, column_name)

    # Folder to save downloaded images
    output_folder = "downloaded_images"

    # Download images
    download_images_from_list(image_urls, output_folder)
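
The Ruff failures above (F401 unused import, I001 import sorting, PD901 generic `df` name, BLE001 blind exception, S113 missing timeout) would all need fixes before the checks pass. As a rough illustration only, not part of this PR, the download helpers might look like the sketch below once those findings are addressed; the 10-second timeout and the `urls_frame` name are arbitrary choices, and CSV errors are left to propagate instead of being caught broadly.

import os
from urllib.parse import quote_plus

import pandas as pd
import requests


def read_urls_from_csv(csv_file, column_name):
    # Read the CSV and return the URL column as a list of strings.
    urls_frame = pd.read_csv(csv_file)
    return urls_frame[column_name].tolist()


def download_image(url, folder, timeout=10.0):
    try:
        # Stream the response and give up if the server does not answer in time.
        response = requests.get(url, stream=True, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return

    # Derive a short, filesystem-safe filename from the URL.
    filename = f"{quote_plus(url)[:25]}.jpg"
    output_path = os.path.join(folder, filename)
    with open(output_path, "wb") as file:
        for chunk in response.iter_content(8192):
            file.write(chunk)
    print(f"Downloaded: {url} to {output_path}")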
114 changes: 114 additions & 0 deletions scripts/simplescrapper.py
@@ -0,0 +1,114 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd


def scrape_categories(url):

Ruff check failure (GitHub Actions): scripts/simplescrapper.py:1:1: I001 Import block is un-sorted or un-formatted
    try:
        # Send a GET request to the URL
        response = requests.get(url)

Ruff check failure (GitHub Actions): scripts/simplescrapper.py:9:20: S113 Probable use of `requests` call without timeout

        response.raise_for_status()  # Check for request errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all category elements (adjust based on the website structure)
        categories = []
        for category in soup.find_all(
            "a", class_="category-link"
        ):  # Adjust class or tag based on the website structure
            category_name = category.text.strip()
            category_url = category["href"]
            categories.append({"Category": category_name, "URL": category_url})

        return categories
    except Exception as e:

Ruff check failure (GitHub Actions): scripts/simplescrapper.py:25:12: BLE001 Do not catch blind exception: `Exception`

        print(f"An error occurred while scraping categories: {e}")
        return []


def scrape_products(category_url):
    try:
        # Send a GET request to the category URL
        response = requests.get(category_url)

Ruff check failure (GitHub Actions): scripts/simplescrapper.py:33:20: S113 Probable use of `requests` call without timeout

        response.raise_for_status()  # Check for request errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all product elements (adjust based on the website structure)
        products = []
        for product in soup.find_all(
            "div", class_="product-item"
        ):  # Adjust class or tag based on the website structure
            product_name = product.find(
                "h2", class_="product-name"
            ).text.strip()  # Adjust based on the website structure
            product_price = product.find(
                "span", class_="product-price"
            ).text.strip()  # Adjust based on the website structure
            product_url = product.find("a", class_="product-link")[
                "href"
            ]  # Adjust based on the website structure
            products.append(
                {
                    "Product Name": product_name,
                    "Price": product_price,
                    "URL": product_url,
                }
            )

        return products
    except Exception as e:

Ruff check failure (GitHub Actions): scripts/simplescrapper.py:62:12: BLE001 Do not catch blind exception: `Exception`

        print(f"An error occurred while scraping products: {e}")
        return []


def save_to_excel(categories, output_file):
    try:
        # Create a DataFrame from the categories list
        df = pd.DataFrame(categories)

        # Save the DataFrame to an Excel file
        df.to_excel(output_file, index=False)
        print(f"Categories and products saved to {output_file}")
    except Exception as e:
        print(f"An error occurred while saving to Excel: {e}")


if __name__ == "__main__":
    # Input: E-commerce website URL
    url = input("Enter the e-commerce website URL: ").strip()

    # Output: Excel file name
    output_file = "categories_and_products.xlsx"

    # Scrape categories
    categories = scrape_categories(url)

    # Scrape products for each category
    for category in categories:
        category_url = category["URL"]
        products = scrape_products(category_url)
        for product in products:
            product["Category"] = category[
                "Category"
            ]  # Add category name to each product
        category["Products"] = products

    # Flatten the data for saving to Excel
    all_data = []
    for category in categories:
        for product in category.get("Products", []):
            all_data.append(
                {
                    "Category": category["Category"],
                    "Product Name": product["Product Name"],
                    "Price": product["Price"],
                    "Product URL": product["URL"],
                    "Category URL": category["URL"],
                }
            )

    # Save to Excel
    save_to_excel(all_data, output_file)
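
The scraper has the same S113 and BLE001 findings. A minimal sketch of what `scrape_categories` could look like with an explicit timeout and a narrower exception handler is shown below; the 10-second default is arbitrary and the `category-link` selector is the same placeholder used in the script, so it must be adjusted to the target site.

import requests
from bs4 import BeautifulSoup


def scrape_categories(url, timeout=10.0):
    try:
        # Fetch the page with an explicit timeout and surface HTTP errors.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while scraping categories: {e}")
        return []

    # The CSS class is a placeholder; adjust it to the site being scraped.
    soup = BeautifulSoup(response.content, "html.parser")
    return [
        {"Category": link.text.strip(), "URL": link["href"]}
        for link in soup.find_all("a", class_="category-link")
    ]

`scrape_products` would follow the same pattern, and the I001 finding goes away once the import block is grouped and sorted, for example by running `ruff check --fix`.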