from bs4 import BeautifulSoup
import requests
import lxml  # noqa: F401 -- used implicitly as the BeautifulSoup parser backend
import urllib.parse


def print_extracted_data_from_url(url):
    """Fetch one Google search-results page, print every organic result
    (title, snippet, link), and return the "next page" anchor.

    Args:
        url: Absolute URL of a Google search results page.

    Returns:
        The ``<a id="pnnext">`` Tag for the next results page, or
        ``None`` when this is the last page (or the element is absent).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }
    # timeout= prevents the script from hanging forever on a stalled
    # connection (requests has NO default timeout).
    response = requests.get(url, headers=headers, timeout=10).text
    soup = BeautifulSoup(response, 'lxml')

    # Guard: .YyVfkd is absent when Google serves a CAPTCHA page or a
    # changed layout; the original `int(soup.select_one(...).text)`
    # crashed with AttributeError in that case.
    page_node = soup.select_one(".YyVfkd")
    if page_node is not None:
        print(f'Current page: {int(page_node.text)}')
    print(f'Current URL: {url}')
    print()

    # find_all: `findAll` is the deprecated bs3-era alias.
    for container in soup.find_all('div', class_='tF2Cxc'):
        head = container.find('h3', class_='LC20lb DKV0Md')
        summary = container.find('div', class_='IsZvec')
        link = container.a
        # Skip malformed result cards instead of crashing on `.text`
        # of None (Google's markup varies between result types).
        if head is None or summary is None or link is None:
            continue
        print(head.text)
        print(summary.text)
        print(link['href'])
        print()

    return soup.select_one('a#pnnext')


def scrape():
    """Walk every results page for the hard-coded query, printing each
    page's results, until no "next page" link remains."""
    next_page_node = print_extracted_data_from_url(
        'https://www.google.com/search?hl=en-US&q=cute+animals')
    while next_page_node is not None:
        # pnnext's href is relative; resolve it against the site root.
        next_page_url = urllib.parse.urljoin('https://www.google.com',
                                             next_page_node['href'])
        next_page_node = print_extracted_data_from_url(next_page_url)


if __name__ == '__main__':
    # Guard the entry point so importing this module does not fire
    # live HTTP requests (the original called scrape() unconditionally).
    scrape()
# NOTE(review): the lines below are UI residue from a code-screenshot
# tool (Carbon/ray.so-style "Download PNG/JPEG/SVG" buttons) that was
# accidentally pasted into this file. They are not part of the program
# and previously made the file a syntax error; kept here, commented out.
# Preview:
# downloadDownload PNG
# downloadDownload JPEG
# downloadDownload SVG
# Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
# Click to optimize width for Twitter