Preview:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import xml.etree.ElementTree as ET

# Constants
# Sitemap whose <loc> entries list the blog posts to scan.
SITEMAP_URL = 'https://ultimaterides.com/post-sitemap1.xml'
# Host substituted into an image URL when the image fails its check on NEW_DOMAIN.
OLD_DOMAIN = 'old.ultimaterides.com'
# Host whose images are checked for breakage.
# NOTE: this string is also a suffix of OLD_DOMAIN.
NEW_DOMAIN = 'ultimaterides.com'
# Root folder for recovered images; each image's URL path is recreated beneath it.
BASE_DOWNLOAD_DIR = './downloaded_images'  # Change this to your desired directory

def check_image(url, timeout=10):
    """Return True if *url* answers a HEAD request with HTTP 200.

    Args:
        url: Absolute URL of the image to probe.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single unresponsive host would stall the whole run.

    Returns:
        True when the final response status is 200; False for any other
        status or when the request itself fails (connection error,
        timeout, invalid URL, ...).
    """
    try:
        # requests.head() does not follow redirects unless asked to, so a
        # moved image (301/302) would otherwise be reported as missing even
        # though a GET download of the same URL would succeed.
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == 200

def download_image(url, save_path, timeout=30):
    """Download *url* and store it at *save_path*.

    The body is streamed into a temporary ``<save_path>.part`` file that is
    moved into place only after the transfer completes, so an interrupted
    download never leaves a truncated image at the final location.

    Args:
        url: Absolute URL of the image to fetch.
        save_path: Destination file path; missing parent directories are
            created.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Success and failure are reported on stdout; network and
        filesystem errors are caught so one bad image does not abort a
        batch run.
    """
    partial_path = f"{save_path}.part"
    try:
        # The context manager releases the streamed connection on every
        # path, including the non-200 early return.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download: {url}, status code: {response.status_code}")
                return
            # dirname is '' for a bare file name, and os.makedirs('') raises.
            os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(8192):
                    file.write(chunk)
        os.replace(partial_path, save_path)
        print(f"Downloaded: {url} to {os.path.abspath(save_path)}")
    except requests.RequestException as e:
        # Must precede the OSError handler: RequestException derives from
        # IOError, which is an alias of OSError.
        print(f"Error downloading {url}: {e}")
    except OSError as e:
        print(f"Error saving {url} to {save_path}: {e}")
    finally:
        # Remove a leftover partial file. After a successful os.replace the
        # file is already gone, so a failure here is expected and ignored.
        try:
            os.remove(partial_path)
        except OSError:
            pass

def find_and_fix_images(blog_post_url):
    """Recover broken images of one blog post from the old domain.

    Fetches the post, and for every ``<img>`` whose host is exactly
    NEW_DOMAIN and which fails check_image(), probes the same path on
    OLD_DOMAIN. If the old copy exists it is downloaded below
    BASE_DOWNLOAD_DIR, mirroring the URL path.

    Args:
        blog_post_url: Absolute URL of the blog post to scan.

    Returns:
        None. Progress and problems are reported on stdout; a post that
        cannot be fetched is skipped instead of aborting the caller's loop.
    """
    try:
        response = requests.get(blog_post_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching blog post {blog_post_url}: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    base_dir = os.path.abspath(BASE_DOWNLOAD_DIR)
    # Trailing separator so "/base" does not match a sibling like "/base2".
    base_prefix = os.path.join(base_dir, '')

    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue

        try:
            # Resolve relative and protocol-relative src values against the
            # page URL; requests cannot fetch them as written.
            img_url = urljoin(blog_post_url, src.strip())
            parsed_url = urlparse(img_url)
            host = parsed_url.hostname
        except ValueError:
            print(f"Skipping malformed image URL: {src}")
            continue

        # Compare the parsed host rather than searching the whole URL:
        # NEW_DOMAIN is a substring of OLD_DOMAIN, so a substring test would
        # also match images that already live on the old domain.
        if host != NEW_DOMAIN or check_image(img_url):
            print(f"Image found: {img_url} or Image is not broken")
            continue

        # Swap only the host; a str.replace on the full URL would also
        # rewrite occurrences of the domain inside the path or query.
        old_img_url = parsed_url._replace(netloc=OLD_DOMAIN).geturl()
        if not check_image(old_img_url):
            print(f"Image not found on old domain: {old_img_url}")
            continue

        save_path = os.path.abspath(
            os.path.join(base_dir, parsed_url.path.lstrip('/'))
        )
        # Refuse paths that resolve to the download root itself or escape
        # it through ".." segments in the URL path.
        if not save_path.startswith(base_prefix):
            print(f"Skipping image with unsafe path: {img_url}")
            continue

        download_image(old_img_url, save_path)

def get_post_urls(sitemap_url, timeout=30):
    """Return the URLs listed in the ``<loc>`` elements of a sitemap.

    Args:
        sitemap_url: Absolute URL of an XML sitemap that uses the
            sitemaps.org 0.9 namespace.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings with surrounding whitespace removed; empty
        ``<loc>`` elements are skipped.

    Raises:
        requests.RequestException: If the sitemap cannot be fetched or the
            server answers with an error status.
        xml.etree.ElementTree.ParseError: If the response is not
            well-formed XML.
    """
    response = requests.get(sitemap_url, timeout=timeout)
    # Fail here on 4xx/5xx instead of feeding an HTML error page to the
    # XML parser.
    response.raise_for_status()
    root = ET.fromstring(response.content)
    namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # elem.text is None for an empty element and often carries newlines or
    # indentation in pretty-printed sitemaps.
    return [
        elem.text.strip()
        for elem in root.findall('.//ns:loc', namespaces)
        if elem.text and elem.text.strip()
    ]

def main():
    """Scan every post listed in SITEMAP_URL and repair its images."""
    for url in get_post_urls(SITEMAP_URL):
        print(f"Checking blog post: {url}")
        find_and_fix_images(url)


# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Download PNG | Download JPEG | Download SVG

Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!

Click to optimize width for Twitter