Skip to content

Added scripts for bulk download and scraping #11920

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions scripts/bulkdownloadfromurls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import requests
from urllib.parse import quote_plus
import pandas as pd

def read_urls_from_csv(csv_file, column_name):
    """Return the values of one CSV column as a list of URLs.

    Args:
        csv_file: Path of the CSV file to read.
        column_name: Header of the column that holds the URLs.

    Returns:
        The non-blank cells of ``column_name`` in file order. An empty list
        is returned (and a message printed) when the file does not exist,
        when reading it raises ``ValueError``, or when the column is absent.
    """
    try:
        image_data = pd.read_csv(csv_file)
        # Blank cells are loaded as NaN, which is not a usable URL, so they
        # are dropped instead of being handed to the downloader.
        return image_data[column_name].dropna().tolist()
    except FileNotFoundError as e:
        print(f"File not found: {e}")
    except KeyError as e:
        # The column lookup raises KeyError when the header is missing;
        # report it like the other input problems instead of crashing.
        print(f"Column not found in CSV: {e}")
    except ValueError as e:
        print(f"Error processing CSV: {e}")
    return []

def download_image(url, folder):
    """Download one image from ``url`` into ``folder``.

    The response body is streamed to disk in 8192-byte chunks. The file name
    is the first 25 characters of the URL-quoted address, followed by a short
    SHA-256 digest of the full URL and a fixed ``.jpg`` extension. The digest
    keeps URLs that share a long common prefix (same scheme and host) from
    overwriting each other's files.

    Request failures are printed and swallowed so that a bulk run can carry
    on with the next URL.

    Args:
        url: Address of the image to fetch.
        folder: Directory to write the file into.

    Returns:
        None.
    """
    # Imported locally so the block does not depend on a new file-level import.
    import hashlib

    try:
        # Using the response as a context manager releases the connection
        # even if the body is not read to the end.
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()

            # Readable prefix plus a digest of the whole URL for uniqueness.
            digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
            filename = f"{quote_plus(url)[:25]}_{digest}.jpg"
            output_path = os.path.join(folder, filename)

            with open(output_path, 'wb') as file:
                for chunk in response.iter_content(8192):
                    file.write(chunk)

        print(f"Downloaded: {url} to {output_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")

def download_images_from_list(url_list, folder):
    """Download every URL in ``url_list`` into ``folder``.

    The folder (including missing parent directories) is created first if it
    does not already exist; each URL is then passed to ``download_image`` in
    order.

    Args:
        url_list: Iterable of image URLs.
        folder: Destination directory.

    Returns:
        None.
    """
    os.makedirs(folder, exist_ok=True)
    for url in url_list:
        download_image(url, folder)

Check failure on line 47 in scripts/bulkdownloadfromurls.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff

scripts/bulkdownloadfromurls.py:46:24: SyntaxError: Expected ':', found newline

if __name__ == "__main__":
    # Script entry point: read the image links from the dataset CSV, then
    # fetch each of them into the local output directory.
    source_csv, link_column = "face_dataset.csv", "Imagelink"
    target_dir = "downloaded_images"
    download_images_from_list(read_urls_from_csv(source_csv, link_column), target_dir)
Loading