Extract Title Terms from Google

PHOTO EMBED

Wed Dec 13 2023 01:05:53 GMT+0000 (Coordinated Universal Time)

Saved by @richtatum #seo #javascript #js #google #serp

// Phrase to look up on Google; it is sent wrapped in double quotes (%22 ... %22).
const search = "Google * and SEO";

// Query-string form of the phrase: surrounding whitespace is dropped, every
// run of inner whitespace becomes a single '+', and each term is
// percent-encoded so characters such as '&' or '#' cannot break the URL.
// (A plain-string pattern in String.prototype.replace only swaps the FIRST
// match, which previously left raw spaces in the URL.)
const searchQ = search
  .trim()
  .split(/\s+/)
  .map(term => encodeURIComponent(term))
  .join('+');

// Google search URL for the quoted phrase, requesting up to 1000 results
const searchUrl = `https://www.google.com/search?q=%22${searchQ}%22&num=1000`;

// URL to a raw list of English stop words from the NLTK library hosted on GitHub
const stopwordsUrl = "https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%20list%20of%20english%20stopwords";

// Stop words are collected in a Set so lookups are constant-time and duplicates collapse
const stopWords = new Set();

/**
 * Turns one heading into an array of lowercase word tokens.
 *
 * Punctuation and underscores are removed before splitting on whitespace.
 * Empty tokens (left behind when stripped punctuation was surrounded by
 * spaces, or when the heading is blank) and ignored words are discarded.
 *
 * @param {string} title - Raw heading text.
 * @param {Set<string>} ignoredWords - Words to drop (e.g. stop words).
 * @returns {string[]} Remaining tokens, in their original order.
 */
function tokenizeTitle(title, ignoredWords) {
  return title
    .trim()
    .toLowerCase()
    .replace(/[^\w\s]|_/g, "")
    .split(/\s+/)
    .filter(word => word !== "" && !ignoredWords.has(word));
}

/**
 * Counts how often each word occurs across all token lists.
 *
 * A Map is used instead of a plain object so that words which are also
 * Object.prototype member names (e.g. "constructor") are counted correctly.
 *
 * @param {string[][]} tokenLists - One token array per heading.
 * @returns {Map<string, number>} Word -> number of occurrences.
 */
function countWords(tokenLists) {
  const counts = new Map();
  for (const word of tokenLists.flat()) {
    counts.set(word, (counts.get(word) || 0) + 1);
  }
  return counts;
}

/**
 * Finds the words whose count is strictly above the count located at the
 * given percentile position of the ascending-sorted counts.
 *
 * @param {Map<string, number>} wordCounts - Word -> occurrences.
 * @param {number} percentile - Fraction (0..1) of the sorted counts at which the threshold is read.
 * @returns {Set<string>} Words more frequent than the threshold (empty when there are no words).
 */
function findFrequentWords(wordCounts, percentile) {
  const sortedCounts = [...wordCounts.values()].sort((a, b) => a - b);
  if (sortedCounts.length === 0) return new Set();

  const thresholdValue = sortedCounts[Math.floor(sortedCounts.length * percentile)];
  const frequentWords = new Set();
  for (const [word, count] of wordCounts) {
    if (count > thresholdValue) frequentWords.add(word);
  }
  return frequentWords;
}

/**
 * Rebuilds each heading without the frequent words and keeps only the
 * distinct results that still contain more than one word.
 *
 * @param {string[][]} tokenLists - One token array per heading.
 * @param {Set<string>} frequentWords - Words to strip from every heading.
 * @returns {Set<string>} Unique multi-word phrases, in first-seen order.
 */
function rebuildTitles(tokenLists, frequentWords) {
  return new Set(tokenLists
    .map(words => words.filter(word => !frequentWords.has(word)).join(' '))
    .filter(text => text.split(' ').length > 1));
}

/**
 * Downloads the stop-word list, then the Google results page, and logs the
 * distinctive multi-word phrases found in the result headings (h3 elements).
 *
 * Reads `stopwordsUrl` and `searchUrl`, and fills the shared `stopWords` Set.
 * Relies on the browser-provided `fetch` and `DOMParser`.
 *
 * @returns {Promise<void>} Rejects if either request fails or returns a non-OK status.
 */
async function extractTitleTerms() {
  const stopwordsResponse = await fetch(stopwordsUrl);
  if (!stopwordsResponse.ok) throw new Error('Network response was not ok');
  const stopwordsData = await stopwordsResponse.text();

  // One stop word per line; blank lines are skipped so "" never enters the Set
  for (const line of stopwordsData.split(/\n/)) {
    const word = line.trim();
    if (word !== "") stopWords.add(word);
  }

  // The search page is requested only after the stop words loaded successfully
  const searchResponse = await fetch(searchUrl);
  if (!searchResponse.ok) throw new Error('Network response was not ok');
  const data = await searchResponse.text();

  // Parse the returned HTML string into a DOM Document object
  const htmlDoc = new DOMParser().parseFromString(data, "text/html");

  // Position in the sorted word counts above which a word is treated as too common
  const bottomPercentile = 0.98;

  // One token list per h3 heading in the results page
  const processedTexts = Array.from(
    htmlDoc.querySelectorAll('h3'),
    h3 => tokenizeTitle(h3.textContent, stopWords)
  );

  const frequentWords = findFrequentWords(countWords(processedTexts), bottomPercentile);

  // Log each distinct reconstructed phrase to the console
  rebuildTitles(processedTexts, frequentWords).forEach(text => console.log(text));
}

// Run the pipeline; any network, HTTP-status or parsing failure is logged here
extractTitleTerms().catch(error => console.error('Fetch error:', error));
content_copyCOPY

https://snippets.cacher.io/snippet/43d97624291091879319