forked from TheAlgorithms/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimplescrapper.py
114 lines (94 loc) · 3.78 KB
/
simplescrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
def scrape_categories(url, timeout=30):
    """Fetch ``url`` and return the category links found on the page.

    Categories are the ``<a class="category-link">`` anchors of the page;
    adjust the tag/class to match the target website's markup.

    Args:
        url: Address of the page that lists the categories.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"Category": name, "URL": absolute_url}`` dicts in
        document order. Anchors without an ``href`` are skipped. If the page
        cannot be fetched or parsed, the error is printed and an empty list
        is returned.
    """
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        soup = BeautifulSoup(response.content, "html.parser")

        categories = []
        for link in soup.find_all("a", class_="category-link"):
            href = link.get("href")
            if not href:
                # An anchor without a target cannot be followed; skip it
                # instead of letting a KeyError discard every category.
                continue
            categories.append(
                {
                    "Category": link.text.strip(),
                    # Resolve relative hrefs against the final (post-redirect)
                    # page URL so the result can be requested directly.
                    "URL": urljoin(response.url, href),
                }
            )
        return categories
    except Exception as e:
        # Best-effort contract: callers rely on getting a list back.
        print(f"An error occurred while scraping categories: {e}")
        return []
def scrape_products(category_url, timeout=30):
    """Fetch a category page and return the products listed on it.

    Products are the ``<div class="product-item">`` elements of the page,
    each expected to contain an ``<h2 class="product-name">``, a
    ``<span class="product-price">`` and an ``<a class="product-link">``;
    adjust the tags/classes to match the target website's markup.

    Args:
        category_url: Address of the category page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"Product Name": ..., "Price": ..., "URL": ...}`` dicts
        in document order. Items missing the name, price or link are skipped.
        If the page cannot be fetched or parsed, the error is printed and an
        empty list is returned.
    """
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(category_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        soup = BeautifulSoup(response.content, "html.parser")

        products = []
        for item in soup.find_all("div", class_="product-item"):
            name_tag = item.find("h2", class_="product-name")
            price_tag = item.find("span", class_="product-price")
            link_tag = item.find("a", class_="product-link")
            href = link_tag.get("href") if link_tag is not None else None

            # find() returns None for a missing element; skip the incomplete
            # item so one malformed entry does not discard the whole page.
            if name_tag is None or price_tag is None or not href:
                continue

            products.append(
                {
                    "Product Name": name_tag.text.strip(),
                    "Price": price_tag.text.strip(),
                    # Resolve relative hrefs against the final page URL.
                    "URL": urljoin(response.url, href),
                }
            )
        return products
    except Exception as e:
        # Best-effort contract: callers rely on getting a list back.
        print(f"An error occurred while scraping products: {e}")
        return []
def save_to_excel(categories, output_file):
    """Write a list of row dicts to an Excel file.

    Args:
        categories: Rows to store; each dict becomes one spreadsheet row and
            its keys become the column headers.
        output_file: Path of the Excel file to create.

    Any error raised while building or writing the sheet is printed rather
    than propagated.
    """
    try:
        # Build the table and write it out in one step, without the index column.
        pd.DataFrame(categories).to_excel(output_file, index=False)
        print(f"Categories and products saved to {output_file}")
    except Exception as exc:
        print(f"An error occurred while saving to Excel: {exc}")
def main():
    """Prompt for a site URL, scrape its categories and products, save to Excel.

    Each product becomes one row holding its category name, product name,
    price, product URL and category URL. The rows are written to
    ``categories_and_products.xlsx`` in the current working directory.
    """
    # Input: E-commerce website URL
    url = input("Enter the e-commerce website URL: ").strip()
    # Output: Excel file name
    output_file = "categories_and_products.xlsx"

    # Build the flat rows in a single pass: one row per product, tagged with
    # the category it was found in.
    all_data = []
    for category in scrape_categories(url):
        all_data.extend(
            {
                "Category": category["Category"],
                "Product Name": product["Product Name"],
                "Price": product["Price"],
                "Product URL": product["URL"],
                "Category URL": category["URL"],
            }
            for product in scrape_products(category["URL"])
        )

    # Save to Excel
    save_to_excel(all_data, output_file)


if __name__ == "__main__":
    main()