📸 Automate Website Screenshots with Python and Selenium

OSbooter

20/06/2024

41 Views 0

SaveSavedRemoved 0

🔧 Step-by-Step Guide

1. Setting Up the Environment

First, let’s get our environment ready by installing the necessary packages. Open your terminal and run:

pip install selenium webdriver-manager

2. Creating the Script

Now, let’s create a Python script named take_screenshots.py. This script will:

- Load URLs from the first column of a CSV file (the header row is skipped).
- Use Selenium to open each URL in headless Chrome.
- Handle cookie consent pop-ups.
- Take a screenshot and save it with a sanitized filename.

Here’s the script:

import csv
import os
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def load_urls_from_csv(csv_filename):
    """Read the URLs stored in the first column of a CSV file.

    The first row is treated as a header and skipped. Blank rows and rows
    whose first cell is empty or whitespace-only are ignored, and surrounding
    whitespace is stripped from each URL.

    Args:
        csv_filename: Path to a UTF-8 encoded CSV file.

    Returns:
        A list of URL strings in file order; an empty list if the file has
        no data rows (including a completely empty file).

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(csv_filename, newline='', encoding='utf-8') as csvfile:
        csvreader = csv.reader(csvfile)
        # The default keeps a completely empty file from raising StopIteration.
        next(csvreader, None)
        first_cells = (row[0].strip() for row in csvreader if row)
        return [url for url in first_cells if url]

def accept_cookies(driver, timeout=5):
    """Click the first cookie-consent button found on the current page.

    The candidate XPaths are tried in priority order (button text first,
    then CSS class hints). All candidates share ONE wait of ``timeout``
    seconds; waiting ``timeout`` seconds per XPath would stall a page without
    a banner for 18 x 5 = 90 seconds with the default settings.

    This is best-effort: failures are reported with ``print`` and never
    raised, so a missing or unclickable banner does not abort the screenshot.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
        timeout: Total number of seconds to wait for a consent button.

    Returns:
        None.
    """
    # Imported locally so this function needs no new top-of-file import.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    consent_button_xpaths = [
        "//button[contains(text(), 'Accept')]",
        "//button[contains(text(), 'accept')]",
        "//button[contains(text(), 'Agree')]",
        "//button[contains(text(), 'agree')]",
        "//button[contains(text(), 'Allow')]",
        "//button[contains(text(), 'allow')]",
        "//button[contains(text(), 'OK')]",
        "//button[contains(text(), 'Ok')]",
        "//button[contains(text(), 'Got it')]",
        "//button[contains(text(), 'got it')]",
        "//button[contains(text(), 'I agree')]",
        "//button[contains(text(), 'I accept')]",
        "//button[contains(@class, 'accept')]",
        "//button[contains(@class, 'agree')]",
        "//button[contains(@class, 'allow')]",
        "//button[contains(@class, 'ok')]",
        "//button[contains(@class, 'consent')]",
        "//button[contains(@class, 'cookie')]",
    ]

    def click_first_consent_button(drv):
        """Click the first visible, enabled match; return its XPath or False."""
        for xpath in consent_button_xpaths:
            for button in drv.find_elements(By.XPATH, xpath):
                try:
                    if button.is_displayed() and button.is_enabled():
                        button.click()
                        return xpath
                except WebDriverException:
                    # Stale, covered or otherwise unclickable element:
                    # fall through to the next candidate.
                    continue
        # A falsy result makes WebDriverWait poll again until the timeout.
        return False

    try:
        clicked_xpath = WebDriverWait(driver, timeout).until(
            click_first_consent_button
        )
    except TimeoutException:
        print("No cookie consent button found.")
    except Exception as e:
        print(f"Error finding cookie consent button: {e}")
    else:
        print(f"Cookie consent button clicked: {clicked_xpath}")
def sanitize_filename(url):
    """Turn a URL into a string that is safe to use as a file name.

    Every character other than an ASCII letter, digit, underscore or hyphen
    is replaced with an underscore, and the result is cut to 255 characters.

    Args:
        url: The URL (or any string) to convert.

    Returns:
        The sanitized string, at most 255 characters long.
    """
    unsafe_chars = re.compile(r'[^a-zA-Z0-9_\-]')
    cleaned = unsafe_chars.sub('_', url)
    max_length = 255  # keep the name from growing too long
    return cleaned[:max_length]

def take_screenshot(url, output_path):
    """Open ``url`` in headless Chrome and save a screenshot of the page.

    A fresh browser is started for every call with a 1920x1080 window. After
    the page body is present, a best-effort attempt is made to dismiss a
    cookie-consent banner before the screenshot is taken.

    Errors are reported with ``print`` rather than raised, so one failing URL
    does not stop a batch run. The browser is always shut down, even when an
    error occurs part-way through.

    Args:
        url: Address of the page to capture.
        output_path: File path the PNG screenshot is written to.

    Returns:
        None.
    """
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")

        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        accept_cookies(driver)
        WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        # save_screenshot signals a failed write by returning False instead
        # of raising, so the result must be checked before reporting success.
        if driver.save_screenshot(output_path):
            print(f"Screenshot saved to {output_path}")
        else:
            print(f"Error taking screenshot of {url}: could not write {output_path}")
    except Exception as e:
        print(f"Error taking screenshot of {url}: {e}")
    finally:
        # Quit in `finally` so a failure above cannot leak a Chrome process.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Error closing browser for {url}: {e}")

# Batch driver: read the URL list, make sure the output folder exists, then
# capture one screenshot per URL.
csv_filename = 'urls.csv'
urls = load_urls_from_csv(csv_filename)
os.makedirs('screenshots', exist_ok=True)
for url in urls:
    # sanitize_filename may return up to 255 characters; trim the stem so the
    # name INCLUDING the '.png' extension stays within the common
    # 255-character file-name limit instead of overshooting it by four.
    sanitized_filename = sanitize_filename(url)[:255 - len('.png')] + '.png'
    output_path = os.path.join('screenshots', sanitized_filename)
    take_screenshot(url, output_path)

print("Screenshots taken for all URLs.")