6.1 building_the_bot.py

PHOTO EMBED

Wed Jan 31 2024 19:54:47 GMT+0000 (Coordinated Universal Time)

Saved by @mnis00014 #python #selenium #scraping

# Handle pagination with Selenium
# Scrape Website (www.audible.com)

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time

path = r"C:\Drivers\chromedriver-win64\chromedriver.exe"
website = "https://www.audible.com/search"

# Use the Service class to specify the path to chromedriver.exe
service = Service(executable_path=path)

# Use ChromeOptions for additional configurations
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)

# Initialize the WebDriver with the specified service and options
driver = webdriver.Chrome(service=service, options=options)

# Navigate to the specific website
driver.get(website)

# Wait for some time to ensure the page is loaded
time.sleep(5)

try:
    # Wait for the container to be present
    container = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'adbl-impression-container'))
    )

    # Wait for the products to be present within the container
    products = WebDriverWait(container, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, './/li[contains(@class, "productListItem")]'))
    )

    book_title = []
    author_name = []
    run_time = []
    release_date = []

    for product in products:
        try:
            # Wait for the book title element to be present within each product
            book_title_elem = WebDriverWait(product, 5).until(
                EC.presence_of_element_located((By.XPATH, './/h3[contains(@class, "bc-heading")]'))
            )

            # Append book title
            book_title.append(book_title_elem.text)
            
            # Append author name
            author_name_elem = product.find_element(By.XPATH, './/li[contains(@class, "authorLabel")]')
            author_name.append(author_name_elem.text)

            # Append run time
            run_time_elem = product.find_element(By.XPATH, './/li[contains(@class, "runtimeLabel")]')
            run_time.append(run_time_elem.text)

            # Append release date
            release_date_elem = product.find_element(By.XPATH, './/li[contains(@class, "releaseDateLabel")]')
            release_date.append(release_date_elem.text)

        except TimeoutException:
            print("Timeout occurred while waiting for element within product.")
            # Handle the timeout situation here (e.g., skip this product or log the issue)

    # Create DataFrame and save to CSV
    df = pd.DataFrame({'book_title': book_title,
                       'author_name': author_name,
                       'run_time': run_time,
                       'release_date': release_date})

    df.to_csv('amazon_audible.csv', index=False)
    print(df)

except TimeoutException:
    print("Timeout occurred while waiting for container element.")
    # Handle the timeout situation here (e.g., retry navigating to the page or log the issue)

finally:
    # Quit the driver
    driver.quit()

content_copyCOPY