"""Recover images missing on the new domain by downloading them from the old one.

The script walks every post listed in a sitemap, checks each ``<img>`` that is
hosted on ``NEW_DOMAIN``, and, when the image is not reachable there, fetches
the same path from ``OLD_DOMAIN`` into ``BASE_DOWNLOAD_DIR`` (mirroring the URL
path as the directory layout).
"""

import os
import xml.etree.ElementTree as ET
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

# Constants
SITEMAP_URL = 'https://ultimaterides.com/post-sitemap1.xml'
OLD_DOMAIN = 'old.ultimaterides.com'
NEW_DOMAIN = 'ultimaterides.com'
BASE_DOWNLOAD_DIR = './downloaded_images'  # Change this to your desired directory
# requests never times out on its own; without this a stalled server would
# hang the whole crawl.
REQUEST_TIMEOUT = 30  # seconds
CHUNK_SIZE = 8192  # bytes written per iteration while streaming a download


def check_image(url):
    """Return True if a HEAD request for *url* ends in HTTP 200.

    Redirects are followed (``requests.head`` does not follow them by
    default), so an image served behind a 301/302 is not reported as broken.
    Any request failure is treated as "not available".
    """
    try:
        response = requests.head(
            url, allow_redirects=True, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException:
        return False
    return response.status_code == 200


def download_image(url, save_path):
    """Stream *url* to *save_path*, creating parent directories as needed.

    The data is written to ``save_path + '.part'`` first and renamed only once
    the transfer completes, so an interrupted download never leaves a
    truncated file under the final name. Failures are printed, not raised.
    """
    part_path = save_path + '.part'
    try:
        # The context manager releases the connection even on early return.
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                print(f"Failed to download: {url}, status code: {response.status_code}")
                return
            directory = os.path.dirname(save_path)
            if directory:  # os.makedirs('') raises, so skip for bare filenames
                os.makedirs(directory, exist_ok=True)
            with open(part_path, 'wb') as file:
                for chunk in response.iter_content(CHUNK_SIZE):
                    file.write(chunk)
        os.replace(part_path, save_path)
        print(f"Downloaded: {url} to {os.path.abspath(save_path)}")
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        # Best-effort cleanup of the partial file; it may not exist yet.
        try:
            os.remove(part_path)
        except OSError:
            pass


def _local_save_path(url_path):
    """Map a URL path to a file path inside BASE_DOWNLOAD_DIR.

    Returns None when the path names no file (empty or ends with '/') or
    when it would resolve outside the download directory (e.g. via '..').
    """
    relative = url_path.lstrip('/')
    if not relative or relative.endswith('/'):
        return None
    base = os.path.abspath(BASE_DOWNLOAD_DIR)
    candidate = os.path.abspath(os.path.join(base, relative))
    if candidate == base or os.path.commonpath([base, candidate]) != base:
        return None
    return candidate


def find_and_fix_images(blog_post_url):
    """Check every ``<img>`` in a blog post and recover the broken ones.

    For each image hosted on ``NEW_DOMAIN`` that is not reachable, the same
    path is tried on ``OLD_DOMAIN`` and downloaded when it exists there.
    A post that cannot be fetched is reported and skipped.
    """
    try:
        response = requests.get(blog_post_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching blog post {blog_post_url}: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue

        try:
            # Resolve relative and protocol-relative sources against the post.
            img_url = urljoin(blog_post_url, src.strip())
            parsed_url = urlparse(img_url)
            hostname = parsed_url.hostname
        except ValueError:
            print(f"Skipping malformed image URL: {src}")
            continue

        # Compare the host exactly: a substring test would also match
        # OLD_DOMAIN, because NEW_DOMAIN is a suffix of it.
        on_new_domain = hostname == NEW_DOMAIN.lower()
        if not on_new_domain or check_image(img_url):
            print(f"Image found: {img_url} or Image is not broken")
            continue

        # Swap only the host; scheme, path and query stay as they were.
        old_img_url = parsed_url._replace(netloc=OLD_DOMAIN).geturl()
        if not check_image(old_img_url):
            print(f"Image not found on old domain: {old_img_url}")
            continue

        save_path = _local_save_path(parsed_url.path)
        if save_path is None:
            print(f"Skipping image with unusable path: {img_url}")
            continue
        download_image(old_img_url, save_path)


def get_post_urls(sitemap_url):
    """Return the non-empty ``<loc>`` URLs listed in the sitemap.

    Raises:
        requests.RequestException: the sitemap could not be fetched or the
            server answered with an error status.
        xml.etree.ElementTree.ParseError: the response is not valid XML.
    """
    response = requests.get(sitemap_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    root = ET.fromstring(response.content)
    namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    return [
        elem.text.strip()
        for elem in root.findall('.//ns:loc', namespaces)
        if elem.text and elem.text.strip()
    ]


def main():
    """Process every post in SITEMAP_URL, recovering broken images."""
    try:
        post_urls = get_post_urls(SITEMAP_URL)
    except (requests.RequestException, ET.ParseError) as e:
        print(f"Error reading sitemap {SITEMAP_URL}: {e}")
        return

    for post_url in post_urls:
        print(f"Checking blog post: {post_url}")
        find_and_fix_images(post_url)


if __name__ == '__main__':
    main()