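"""Scrape the category and product listings of an e-commerce site into an Excel file.

Requires: requests, beautifulsoup4, pandas, and openpyxl (for .xlsx output).
Note: the CSS classes targeted below ('category-link', 'product-item', etc.)
are placeholders; adjust them to match the markup of the site being scraped.
"""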
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

def scrape_categories(url):
    try:
        # Send a GET request to the URL (with a timeout so a dead host can't hang the script)
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Check for request errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all category links (adjust the tag/class to the website's structure)
        categories = []
        for category in soup.find_all('a', class_='category-link'):
            category_name = category.text.strip()
            # Resolve relative hrefs against the base URL
            category_url = urljoin(url, category['href'])
            categories.append({'Category': category_name, 'URL': category_url})

        return categories
    except Exception as e:
        print(f"An error occurred while scraping categories: {e}")
        return []

def scrape_products(category_url):
    try:
        # Send a GET request to the category URL
        response = requests.get(category_url, timeout=10)
        response.raise_for_status()  # Check for request errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all product elements (adjust the tag/class to the website's structure)
        products = []
        for product in soup.find_all('div', class_='product-item'):
            name_tag = product.find('h2', class_='product-name')
            price_tag = product.find('span', class_='product-price')
            link_tag = product.find('a', class_='product-link')
            # Skip items missing an expected field instead of raising an AttributeError
            if not (name_tag and price_tag and link_tag):
                continue
            products.append({
                'Product Name': name_tag.text.strip(),
                'Price': price_tag.text.strip(),
                'URL': urljoin(category_url, link_tag['href']),
            })

        return products
    except Exception as e:
        print(f"An error occurred while scraping products: {e}")
        return []

def save_to_excel(rows, output_file):
    try:
        # Create a DataFrame from the flattened product rows
        df = pd.DataFrame(rows)

        # Save the DataFrame to an Excel file (.xlsx output requires openpyxl)
        df.to_excel(output_file, index=False)
        print(f"Categories and products saved to {output_file}")
    except Exception as e:
        print(f"An error occurred while saving to Excel: {e}")

if __name__ == "__main__":
    # Input: e-commerce website URL
    url = input("Enter the e-commerce website URL: ").strip()

    # Output: Excel file name
    output_file = 'categories_and_products.xlsx'

    # Scrape categories
    categories = scrape_categories(url)

    # Scrape products for each category
    for category in categories:
        products = scrape_products(category['URL'])
        for product in products:
            product['Category'] = category['Category']  # Tag each product with its category name
        category['Products'] = products

    # Flatten the data into one row per product for Excel
    all_data = []
    for category in categories:
        for product in category.get('Products', []):
            all_data.append({
                'Category': category['Category'],
                'Product Name': product['Product Name'],
                'Price': product['Price'],
                'Product URL': product['URL'],
                'Category URL': category['URL'],
            })

    # Save to Excel
    save_to_excel(all_data, output_file)
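
# Usage sketch (script filename and URL below are illustrative, not from the original):
#   pip install requests beautifulsoup4 pandas openpyxl
#   python scraper.py
#   Enter the e-commerce website URL: https://example.com
#   Categories and products saved to categories_and_products.xlsx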