1
-
2
1
import requests
3
2
from bs4 import BeautifulSoup
4
3
import pandas as pd
5
4
5
+
6
6
def scrape_categories(url):
    """Scrape category links from an e-commerce page.

    Args:
        url: The page URL to fetch.

    Returns:
        A list of dicts with keys ``"Category"`` (link text) and
        ``"URL"`` (the link's ``href``); an empty list on any error.
    """
    try:
        # Send a GET request to the URL. A timeout is required:
        # requests.get with no timeout can block indefinitely on a
        # dead or slow host.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise on 4xx/5xx status codes

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all category elements. The tag/class is site-specific:
        # adjust based on the website structure.
        categories = []
        for category in soup.find_all("a", class_="category-link"):
            categories.append(
                {
                    "Category": category.text.strip(),
                    "URL": category["href"],
                }
            )
        return categories
    except Exception as e:
        # Broad catch keeps the scraper best-effort: report the
        # failure and return an empty result instead of crashing.
        print(f"An error occurred while scraping categories: {e}")
        return []
26
28
29
+
27
30
def scrape_products(category_url):
    """Scrape product listings from a category page.

    Args:
        category_url: URL of the category page to fetch.

    Returns:
        A list of dicts with keys ``"Product Name"``, ``"Price"`` and
        ``"URL"``; an empty list on any error. Individual malformed
        product entries are skipped rather than aborting the page.
    """
    try:
        # Send a GET request to the category URL. Timeout prevents
        # an unresponsive host from hanging the scraper forever.
        response = requests.get(category_url, timeout=30)
        response.raise_for_status()  # Raise on 4xx/5xx status codes

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all product elements. The tags/classes are
        # site-specific: adjust based on the website structure.
        products = []
        for product in soup.find_all("div", class_="product-item"):
            name_el = product.find("h2", class_="product-name")
            price_el = product.find("span", class_="product-price")
            link_el = product.find("a", class_="product-link")
            # Skip malformed items: a missing sub-element would
            # otherwise raise AttributeError and discard the whole
            # page via the except below.
            if name_el is None or price_el is None or link_el is None:
                continue
            products.append(
                {
                    "Product Name": name_el.text.strip(),
                    "Price": price_el.text.strip(),
                    "URL": link_el["href"],
                }
            )
        return products
    except Exception as e:
        # Best-effort: report the failure and return an empty list.
        print(f"An error occurred while scraping products: {e}")
        return []
48
65
66
+
49
67
def save_to_excel (categories , output_file ):
50
68
try :
51
69
# Create a DataFrame from the categories list
@@ -57,35 +75,40 @@ def save_to_excel(categories, output_file):
57
75
except Exception as e :
58
76
print (f"An error occurred while saving to Excel: { e } " )
59
77
78
+
60
79
if __name__ == "__main__" :
61
80
# Input: E-commerce website URL
62
81
url = input ("Enter the e-commerce website URL: " ).strip ()
63
82
64
83
# Output: Excel file name
65
- output_file = ' categories_and_products.xlsx'
84
+ output_file = " categories_and_products.xlsx"
66
85
67
86
# Scrape categories
68
87
categories = scrape_categories (url )
69
88
70
89
# Scrape products for each category
71
90
for category in categories :
72
- category_url = category [' URL' ]
91
+ category_url = category [" URL" ]
73
92
products = scrape_products (category_url )
74
93
for product in products :
75
- product ['Category' ] = category ['Category' ] # Add category name to each product
76
- category ['Products' ] = products
94
+ product ["Category" ] = category [
95
+ "Category"
96
+ ] # Add category name to each product
97
+ category ["Products" ] = products
77
98
78
99
# Flatten the data for saving to Excel
79
100
all_data = []
80
101
for category in categories :
81
- for product in category .get ('Products' , []):
82
- all_data .append ({
83
- 'Category' : category ['Category' ],
84
- 'Product Name' : product ['Product Name' ],
85
- 'Price' : product ['Price' ],
86
- 'Product URL' : product ['URL' ],
87
- 'Category URL' : category ['URL' ]
88
- })
102
+ for product in category .get ("Products" , []):
103
+ all_data .append (
104
+ {
105
+ "Category" : category ["Category" ],
106
+ "Product Name" : product ["Product Name" ],
107
+ "Price" : product ["Price" ],
108
+ "Product URL" : product ["URL" ],
109
+ "Category URL" : category ["URL" ],
110
+ }
111
+ )
89
112
90
113
# Save to Excel
91
114
save_to_excel (all_data , output_file )
0 commit comments