"""Scrape The Infatuation's New York guides into ``output.csv``.

Loads the guides index page, dismisses the newsletter modal and bottom
overlay if present, clicks "load more" up to ten times, then visits each
guide page and records its title, URL, restaurant names, and publication
date as one CSV row per article.
"""

import csv
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

URL = "https://www.theinfatuation.com/new-york/guides"
LOAD_MORE_CLICKS = 10       # how many times to expand the guide list
RENDER_WAIT_SECONDS = 5     # pause after each click so new cards render
WAIT_TIMEOUT_SECONDS = 10   # explicit-wait timeout for clickable elements


def _dismiss_popups(driver):
    """Close the newsletter modal and the bottom overlay if they appear.

    Both dismissals are best-effort: a TimeoutException simply means the
    pop-up never showed, so we log and continue.
    """
    try:
        no_thanks_button = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 "button[data-click='close'][aria-label='No thanks; close the dialog']"))
        )
        no_thanks_button.click()
        print("Closed the modal!")
    except TimeoutException:
        print("No modal to close!")

    try:
        overlay_close_button = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
            EC.element_to_be_clickable((By.ID, "bx-close-inside-2272826"))
        )
        # JS click: the overlay's close button may not be directly clickable.
        driver.execute_script("arguments[0].click();", overlay_close_button)
        print("Closed the overlay!")
    except TimeoutException:
        print("Overlay not found or already closed.")


def _expand_guide_list(driver):
    """Click the "load more" button up to LOAD_MORE_CLICKS times.

    Stops early (without raising) once the button can no longer be found,
    which usually means the full list has been loaded.
    """
    for attempt in range(LOAD_MORE_CLICKS):
        try:
            load_more_button = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "div.css-xi606m"))
            )
            load_more_button.click()
            print(f"Clicked load more {attempt + 1} times.")
            # Allow the newly requested cards to be rendered before the
            # next click (the button reappears before the data does).
            time.sleep(RENDER_WAIT_SECONDS)
        except TimeoutException:
            print(f"Couldn't click 'load more' the {attempt + 1}-th time. "
                  "Continuing with the available data.")
            break


def _collect_guide_links(driver):
    """Return the href of every guide card currently on the index page."""
    guide_elements = driver.find_elements(
        By.CSS_SELECTOR, "a[data-testid='detailedStory-link']")
    return [guide.get_attribute("href") for guide in guide_elements]


def _scrape_guides(driver, links, writer):
    """Visit each guide URL and write one CSV row per article.

    Articles missing any expected element are logged and skipped rather
    than aborting the whole run.
    """
    for link in links:
        driver.get(link)
        try:
            title = driver.find_element(
                By.CSS_SELECTOR, "h1 > .styles_title__QfDF5").text
            date_element = driver.find_element(
                By.CSS_SELECTOR, "div.styles_contributorsList__EKq26 time")
            date = date_element.get_attribute("datetime")
            restaurants = [
                restaurant.text
                for restaurant in driver.find_elements(
                    By.CSS_SELECTOR,
                    "h2.chakra-heading.styles_mainHeading__e4VAy.flatplan_venue-heading.css-67umdg")
            ]
            print("Article Title:", title)
            print("URL:", link)
            print("Restaurants:", ", ".join(restaurants))
            print("Date:", date)
            print("------")
            writer.writerow({"Article Title": title,
                             "URL": link,
                             "Restaurants": ", ".join(restaurants),
                             "Date": date})
        except NoSuchElementException:
            print(f"Couldn't retrieve complete data for article at URL: {link}")


def main():
    """Drive the full scrape: open browser, expand list, export CSV."""
    chrome_options = webdriver.ChromeOptions()
    # "detach" keeps the browser window open after the driver script exits.
    chrome_options.add_experimental_option("detach", True)
    print("Starting the browser...")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(URL)
        _dismiss_popups(driver)
        _expand_guide_list(driver)
        links = _collect_guide_links(driver)
        with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
            fieldnames = ["Article Title", "URL", "Restaurants", "Date"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            _scrape_guides(driver, links, writer)
    finally:
        # Release the WebDriver session even if scraping raised.
        driver.quit()


if __name__ == "__main__":
    main()
# NOTE(review): the lines below are non-code residue from the code-screenshot
# tool this snippet was copied out of (preview/download links and usage tips).
# They are commented out so the file remains valid Python; they carry no
# program content and can be deleted outright.
# Preview:
# downloadDownload PNG
# downloadDownload JPEG
# downloadDownload SVG
# Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
# Click to optimize width for Twitter