# Handle pagination with Selenium
# Scrape Website (www.audible.com)
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
path = r"C:\Drivers\chromedriver-win64\chromedriver.exe"
website = "https://www.audible.com/search"


def _scrape_product(product):
    """Read the four text fields of a single product list item.

    Args:
        product: The WebElement of one ``productListItem`` entry.

    Returns:
        A ``(title, author, run_time, release_date)`` tuple of strings.

    Raises:
        TimeoutException: The title heading did not appear within 5 seconds.
        NoSuchElementException: The author, runtime or release-date label
            is not present inside the product.
    """
    # Only the title is waited for; the labels are looked up immediately.
    title_elem = WebDriverWait(product, 5).until(
        EC.presence_of_element_located(
            (By.XPATH, './/h3[contains(@class, "bc-heading")]')
        )
    )
    title = title_elem.text
    author = product.find_element(
        By.XPATH, './/li[contains(@class, "authorLabel")]'
    ).text
    length = product.find_element(
        By.XPATH, './/li[contains(@class, "runtimeLabel")]'
    ).text
    released = product.find_element(
        By.XPATH, './/li[contains(@class, "releaseDateLabel")]'
    ).text
    return title, author, length, released


# Use the Service class to specify the path to chromedriver.exe
service = Service(executable_path=path)

# Use ChromeOptions for additional configurations
options = webdriver.ChromeOptions()
# NOTE(review): "detach" is requested here, but driver.quit() in the
# ``finally`` below is expected to close the browser anyway - confirm intent.
options.add_experimental_option("detach", True)

# Initialize the WebDriver with the specified service and options
driver = webdriver.Chrome(service=service, options=options)

try:
    # Navigation happens inside the try so that the ``finally`` clause quits
    # the browser even when loading the page fails.
    driver.get(website)

    # Fixed pause kept from the original flow, on top of the explicit waits
    # below, so that the result list has time to render.
    time.sleep(5)

    # Wait for the container to be present
    container = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.CLASS_NAME, 'adbl-impression-container')
        )
    )

    # Wait for the products to be present within the container
    products = WebDriverWait(container, 10).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, './/li[contains(@class, "productListItem")]')
        )
    )

    book_title = []
    author_name = []
    run_time = []
    release_date = []

    for product in products:
        try:
            title, author, length, released = _scrape_product(product)
        except TimeoutException:
            print("Timeout occurred while waiting for element within product.")
            continue
        except NoSuchElementException as exc:
            # find_element raises this (not TimeoutException) when a label is
            # missing; skip the product instead of aborting the whole scrape.
            print(f"Skipping product with a missing field: {exc.msg}")
            continue

        # Append only after every field was read, so the four columns always
        # have the same length.
        book_title.append(title)
        author_name.append(author)
        run_time.append(length)
        release_date.append(released)

    # Create DataFrame and save to CSV
    df = pd.DataFrame({'book_title': book_title,
                       'author_name': author_name,
                       'run_time': run_time,
                       'release_date': release_date})
    df.to_csv('amazon_audible.csv', index=False)
    print(df)
except TimeoutException:
    print("Timeout occurred while waiting for container element.")
finally:
    # Quit the driver
    driver.quit()