import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import csv
# Landing page that lists The Infatuation's New York restaurant guides.
URL = "https://www.theinfatuation.com/new-york/guides"
chrome_options = webdriver.ChromeOptions()
# "detach" keeps the browser window open after the script finishes
# (handy for inspecting the final page state during development).
chrome_options.add_experimental_option("detach", True)
print("Starting the browser...")
# Launch Chrome and navigate to the guides landing page.
driver = webdriver.Chrome(options=chrome_options)
driver.get(URL)
# Dismiss the sign-up dialog if it appears within 10 seconds.
try:
    dismiss_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((
            By.CSS_SELECTOR,
            "button[data-click='close'][aria-label='No thanks; close the dialog']",
        ))
    )
    dismiss_button.click()
    print("Closed the modal!")
except TimeoutException:
    # Dialog never showed up — nothing to dismiss.
    print("No modal to close!")
# Dismiss the promotional overlay anchored to the bottom of the page.
try:
    bottom_overlay_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "bx-close-inside-2272826"))
    )
    # Click through JavaScript in case the button is covered by other page chrome.
    driver.execute_script("arguments[0].click();", bottom_overlay_button)
    print("Closed the overlay!")
except TimeoutException:
    # Overlay never rendered (or was closed already) — carry on.
    print("Overlay not found or already closed.")
# Paginate: click the "load more" button up to 10 times so more guide
# cards are rendered before we scrape the links.
# NOTE: `attempt` replaces the original `_` — the loop variable is read
# in the progress messages, so the "unused" convention name was misleading.
for attempt in range(10):
    try:
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "div.css-xi606m"))
        )
        load_more_button.click()
        print(f"Clicked load more {attempt+1} times.")
        # Give the page time to render the newly loaded guides.
        time.sleep(5)
    except TimeoutException:
        # Button no longer clickable (end of list or slow page) — stop
        # paginating and scrape whatever is currently loaded.
        print(
            f"Couldn't click 'load more' the {attempt+1}-th time. Continuing with the available data.")
        break
# Collect the href of every guide card currently rendered on the page.
guide_elements = driver.find_elements(
    By.CSS_SELECTOR, "a[data-testid='detailedStory-link']")
guide_data = [guide.get_attribute('href') for guide in guide_elements]

# Visit each guide and write its details to output.csv.
# FIX: encoding="utf-8" is specified explicitly — restaurant names often
# contain non-ASCII characters, which would crash the platform's default
# locale codec (e.g. cp1252 on Windows) with a UnicodeEncodeError.
# FIX: driver.quit() runs in a finally block so the browser is released
# even when a page load or selector lookup raises mid-scrape.
try:
    with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["Article Title", "URL", "Restaurants", "Date"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for link in guide_data:
            driver.get(link)
            try:
                # Article title lives in a styled span inside the <h1>.
                title = driver.find_element(
                    By.CSS_SELECTOR, "h1 > .styles_title__QfDF5").text
                # Publication date comes from the contributors byline's
                # <time> element (machine-readable "datetime" attribute).
                date_element = driver.find_element(
                    By.CSS_SELECTOR, "div.styles_contributorsList__EKq26 time")
                date = date_element.get_attribute("datetime")
                # One venue heading per restaurant featured in the guide.
                restaurants = [restaurant.text for restaurant in driver.find_elements(
                    By.CSS_SELECTOR, "h2.chakra-heading.styles_mainHeading__e4VAy.flatplan_venue-heading.css-67umdg")]
                # Echo the scraped record, then persist it to the CSV.
                print("Article Title:", title)
                print("URL:", link)
                print("Restaurants:", ", ".join(restaurants))
                print("Date:", date)
                print("------")
                writer.writerow({"Article Title": title, "URL": link,
                                 "Restaurants": ", ".join(restaurants), "Date": date})
            except NoSuchElementException:
                # Page layout didn't match the selectors above — skip this
                # article rather than aborting the whole run.
                print(
                    f"Couldn't retrieve complete data for article at URL: {link}")
finally:
    driver.quit()