 import os
 import requests
-from urllib.parse import urlsplit, quote_plus
+from urllib.parse import quote_plus
 import pandas as pd
 
-
 def read_urls_from_csv(csv_file, column_name):
     try:
         # Read CSV file into a DataFrame
-        df = pd.read_csv(csv_file)
+        image_data = pd.read_csv(csv_file)
 
         # Extract URLs from specified column
-        urls = df[column_name].tolist()
+        urls = image_data[column_name].tolist()
 
         return urls
-    except Exception as e:
-        print(f"Error reading URLs from CSV: {e}")
+    except FileNotFoundError as e:
+        print(f"File not found: {e}")
+        return []
+    except ValueError as e:
+        print(f"Error processing CSV: {e}")
         return []
-
 
 def download_image(url, folder):
     try:
-        # Send a GET request to the URL
-        response = requests.get(url, stream=True)
-        response.raise_for_status()  # Check if the request was successful
+        # Send a GET request to the URL with a timeout
+        response = requests.get(url, stream=True, timeout=10)
+        response.raise_for_status()
 
         # Generate a unique filename using the URL
-        filename = quote_plus(url)  # Encode URL to use as filename
-        filename = filename[:25]  # Limit filename length (optional)
-        filename = f"{filename}.jpg"  # Add file extension if needed
+        filename = quote_plus(url)[:25]  # Limit filename length
+        filename = f"{filename}.jpg"
 
         # Create the output path
         output_path = os.path.join(folder, filename)
 
         # Save the image to the specified folder
-        with open(output_path, "wb") as file:
+        with open(output_path, 'wb') as file:
             for chunk in response.iter_content(8192):
                 file.write(chunk)
 
         print(f"Downloaded: {url} to {output_path}")
     except requests.exceptions.RequestException as e:
         print(f"Failed to download {url}: {e}")
 
-
 def download_images_from_list(url_list, folder):
-    # Create the output folder if it doesn't exist
     os.makedirs(folder, exist_ok=True)
-
     for url in url_list:
         download_image(url, folder)
 
-
 if __name__ == "__main__":
-    # CSV file containing URLs
-    csv_file = "your_csv"
-    column_name = (
-        "YOUR COLUMN LINK CONTAING URLS"  # Replace with the column name containing URLs
-    )
-
-    # Read URLs from CSV
+    csv_file = "face_dataset.csv"
+    column_name = "Imagelink"
     image_urls = read_urls_from_csv(csv_file, column_name)
-
-    # Folder to save downloaded images
    output_folder = "downloaded_images"
-
-    # Download images
     download_images_from_list(image_urls, output_folder)
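
For reference, the updated script expects a CSV whose URL column is named "Imagelink"; that file is not part of this change, so the snippet below is only a minimal sketch of a compatible input and invocation, assuming the script's functions are importable in the same session and using placeholder URLs:

    import pandas as pd

    # Hypothetical two-row CSV matching the expected "Imagelink" column.
    pd.DataFrame({"Imagelink": [
        "https://example.com/one.jpg",
        "https://example.com/two.jpg",
    ]}).to_csv("face_dataset.csv", index=False)

    urls = read_urls_from_csv("face_dataset.csv", "Imagelink")  # -> list of URL strings
    download_images_from_list(urls, "downloaded_images")        # saves files into ./downloaded_images/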
|