6.1 building_the_bot.py
Wed Jan 31 2024 19:54:47 GMT+0000 (Coordinated Universal Time)
Saved by @mnis00014 #python #selenium #scraping
# Scrape Website (www.audible.com) with Selenium.
#
# Loads the Audible search page, reads title / author / run time / release date
# for every product listed on that page, and writes them to 'amazon_audible.csv'.
# NOTE: only the single page at `website` is scraped; this script does not
# follow pagination links.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time

path = r"C:\Drivers\chromedriver-win64\chromedriver.exe"
website = "https://www.audible.com/search"

# Output column name -> XPath (relative to one product <li>) of the element
# holding that value. Insertion order is the CSV column order.
FIELD_XPATHS = {
    'book_title': './/h3[contains(@class, "bc-heading")]',
    'author_name': './/li[contains(@class, "authorLabel")]',
    'run_time': './/li[contains(@class, "runtimeLabel")]',
    'release_date': './/li[contains(@class, "releaseDateLabel")]',
}


def _element_text(element):
    """Return the text of *element*, even when it is not currently displayed.

    ``WebElement.text`` only yields rendered (visible) text, so it is an empty
    string for elements that exist in the DOM but are hidden. In that case the
    DOM ``textContent`` is used instead, with runs of whitespace collapsed.
    """
    visible_text = element.text
    if visible_text:
        return visible_text
    raw_text = element.get_attribute("textContent") or ""
    return " ".join(raw_text.split())


def _scrape_product(product):
    """Return a ``{column: text}`` dict for one product list item.

    Raises:
        TimeoutException: if the title heading does not appear within 5 s.
            The title is treated as mandatory; the caller skips the product.

    The other fields are optional: a missing label is recorded as ``''`` so
    every row has the same columns and the table stays aligned.
    """
    title_elem = WebDriverWait(product, 5).until(
        EC.presence_of_element_located((By.XPATH, FIELD_XPATHS['book_title']))
    )
    row = {'book_title': _element_text(title_elem)}

    for column in ('author_name', 'run_time', 'release_date'):
        try:
            field_elem = product.find_element(By.XPATH, FIELD_XPATHS[column])
        except NoSuchElementException:
            # find_element raises NoSuchElementException (not TimeoutException)
            # when a label is absent; keep the product, leave the cell blank.
            row[column] = ''
        else:
            row[column] = _element_text(field_elem)
    return row


def main():
    """Open Chrome, scrape the search page and save the results to CSV."""
    # Use the Service class to specify the path to chromedriver.exe
    service = Service(executable_path=path)

    # Use ChromeOptions for additional configurations.
    # NOTE(review): "detach" is kept from the original configuration, but the
    # driver is always quit below, so it presumably has no visible effect.
    options = webdriver.ChromeOptions()
    options.add_experimental_option("detach", True)

    # Initialize the WebDriver with the specified service and options
    driver = webdriver.Chrome(service=service, options=options)

    # Everything after the driver exists runs inside try/finally so the
    # browser process is released even if navigation itself fails.
    try:
        # Navigate to the specific website
        driver.get(website)

        # Wait for some time to ensure the page is loaded
        time.sleep(5)

        # Wait for the container to be present
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'adbl-impression-container'))
        )

        # Wait for the products to be present within the container
        products = WebDriverWait(container, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, './/li[contains(@class, "productListItem")]')
            )
        )

        rows = []
        for product in products:
            try:
                rows.append(_scrape_product(product))
            except TimeoutException:
                # No title found in time: skip this product entirely.
                print("Timeout occurred while waiting for element within product.")

        # Create DataFrame and save to CSV. Passing `columns` keeps the header
        # row present even when no product could be scraped.
        df = pd.DataFrame(rows, columns=list(FIELD_XPATHS))
        df.to_csv('amazon_audible.csv', index=False)
        print(df)

    except TimeoutException:
        print("Timeout occurred while waiting for container element.")

    finally:
        # Quit the driver
        driver.quit()


if __name__ == "__main__":
    main()
Comments
@mnis00014 - Tue Feb 06 2024 07:00:13 GMT+0000 (Coordinated Universal Time): book_title is not showing