thiscodeWorks - Organizing the best of code online

List view

Scrape Google News results using SerpApi

import os

# https://github.com/serpapi/google-search-results-python
from serpapi import GoogleSearch

# Search request sent to SerpApi. The API key is read from the API_KEY
# environment variable so it does not have to be hard-coded in the script.
params = {
    "engine": "google",
    "q": "cute animals",
    "tbm": "nws",
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)

# Iterating over `pages` yields one result dictionary per results page.
pages = search.pagination()

for result in pages:
    print(f"Current page: {result['serpapi_pagination']['current']}")

    # A page may come back without a "news_results" key; treat that as an
    # empty page instead of aborting the whole run with a KeyError.
    for news_result in result.get("news_results", []):
        print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")

#webscraping #googlenews

Scrape Google News with Pagination

from bs4 import BeautifulSoup
import requests, lxml, urllib.parse


def print_extracted_data_from_url(url):
    """Fetch *url*, print the search results found on it, and return the next-page link.

    Prints the current page number (when the page exposes one), the URL that
    was fetched, and then the title, snippet and link of every result
    container on the page. Pieces that are missing from a result are printed
    as empty lines instead of raising.

    Args:
        url: Address of the results page to download.

    Returns:
        The element matching ``a#pnnext`` (the link to the next page), or
        ``None`` when the page has no such element.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the request times out.
    """
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=30).text

    soup = BeautifulSoup(response, 'lxml')

    # The page-number marker is not guaranteed to be present (the markup may
    # differ or the page may have no pagination), so guard against None.
    page_marker = soup.select_one(".YyVfkd")
    if page_marker is not None:
        print(f'Current page: {int(page_marker.text)}')
    else:
        print('Current page: unknown')
    print(f'Current URL: {url}')
    print()

    for container in soup.find_all('div', class_='tF2Cxc'):
        title_node = container.find('h3', class_='LC20lb DKV0Md')
        summary_node = container.find('div', class_='IsZvec')
        link_node = container.a

        # Any of these nodes can be absent for a given result; fall back to
        # an empty string rather than failing on `None.text` / `None[...]`.
        head_text = title_node.text if title_node is not None else ''
        head_sum = summary_node.text if summary_node is not None else ''
        head_link = link_node.get('href', '') if link_node is not None else ''

        print(head_text)
        print(head_sum)
        print(head_link)
        print()

    return soup.select_one('a#pnnext')


def scrape():
    """Print every results page for the query, following next-page links.

    Starts at the first results page and keeps requesting the page that the
    previous one links to, stopping once a page has no next-page link.
    """
    origin = 'https://www.google.com'
    page_url = 'https://www.google.com/search?hl=en-US&q=cute+animals'

    while True:
        next_link = print_extracted_data_from_url(page_url)
        if next_link is None:
            # No further page is linked: the walk is finished.
            break
        # The link's href is resolved against the site origin to get a full URL.
        page_url = urllib.parse.urljoin(origin, next_link['href'])

# Start scraping as soon as this code runs.
scrape()

Scrape Google News results using SerpApi

Scrape Google News with Pagination

Save snippets that work with our extensions