What will be scraped
Full code
If you don't need an explanation, have a look at the full code example in the online IDE.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const videoLink = "https://www.youtube.com/watch?v=fou37kNbsqE"; // link to video page
async function scrollPage(page, scrollContainer) {
let lastHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
while (true) {
await page.evaluate(`window.scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`);
await page.waitForTimeout(2000);
let newHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
if (newHeight === lastHeight) {
break;
}
lastHeight = newHeight;
}
}
async function fillDataFromPage(page, newDesign) {
const dataFromPage = await page.evaluate((newDesign) => {
const date = document
.querySelector(newDesign ? "#description-inline-expander > yt-formatted-string span:nth-child(3)" : "#info-strings yt-formatted-string")
?.textContent.trim();
const views = document
.querySelector(newDesign ? "#description-inline-expander > yt-formatted-string span:nth-child(1)" : "#info-text #count")
?.textContent.trim();
return {
title: document.querySelector(`${newDesign ? "#title >" : "#info-contents"} h1`)?.textContent.trim(),
likes: parseInt(
document
.querySelector(`${newDesign ? "#top-row" : "#menu"} #top-level-buttons-computed > ytd-toggle-button-renderer:first-child #text`)
?.getAttribute("aria-label")
.replace(/,/g, "")
),
channel: {
name: document.querySelector(`${newDesign ? "#owner" : "ytd-video-owner-renderer"} #channel-name #text > a`)?.textContent.trim(),
link: `https://www.youtube.com${document.querySelector(`${newDesign ? "#owner" : ""} ytd-video-owner-renderer > a`)?.getAttribute("href")}`,
thumbnail: document.querySelector(`${newDesign ? "#owner" : "ytd-video-owner-renderer"} #avatar #img`)?.getAttribute("src"),
},
date,
views: views && parseInt(views.replace(/,/g, "")),
description: newDesign
? document.querySelector("#description-inline-expander > yt-formatted-string")?.textContent.replace(date, "").replace(views, "").trim()
: document.querySelector("#meta #description")?.textContent.trim(),
duration: document.querySelector(".ytp-time-duration")?.textContent.trim(),
hashtags: Array.from(document.querySelectorAll(`${newDesign ? "#super-title" : "#info-contents .super-title"} a`)).map((el) =>
el.textContent.trim()
),
suggestedVideos: Array.from(document.querySelectorAll("ytd-compact-video-renderer")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `https://www.youtube.com${el.querySelector("#thumbnail")?.getAttribute("href")}`,
channelName: el.querySelector("#channel-name #text")?.textContent.trim(),
date: el.querySelector("#metadata-line span:nth-child(2)")?.textContent.trim(),
views: el.querySelector("#metadata-line span:nth-child(1)")?.textContent.trim(),
duration: el.querySelector("#overlays #text")?.textContent.trim(),
thumbnail: el.querySelector("#img")?.getAttribute("src"),
})),
comments: Array.from(document.querySelectorAll("#contents > ytd-comment-thread-renderer")).map((el) => ({
author: el.querySelector("#author-text")?.textContent.trim(),
link: `https://www.youtube.com${el.querySelector("#author-text")?.getAttribute("href")}`,
date: el.querySelector(".published-time-text")?.textContent.trim(),
likes: el.querySelector("#vote-count-middle")?.textContent.trim(),
comment: el.querySelector("#content-text")?.textContent.trim(),
avatar: el.querySelector("#author-thumbnail #img")?.getAttribute("src"),
})),
};
}, newDesign);
return dataFromPage;
}
async function getYoutubeVideoPageResults() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(60000);
await page.goto(videoLink);
await page.waitForSelector("#contents");
const isDesign1 = await page.$("#title > h1");
if (isDesign1) {
await page.click("#description-inline-expander #expand");
} else {
await page.click("#meta #more");
}
const scrollContainer = "ytd-app";
await scrollPage(page, scrollContainer);
await page.waitForTimeout(10000);
const infoFromVideoPage = await fillDataFromPage(page, isDesign1);
await browser.close();
return infoFromVideoPage;
}
getYoutubeVideoPageResults().then((result) => console.dir(result, { depth: null }));
Preparation
First, we need to create a Node.js* project and add the npm packages puppeteer, puppeteer-extra and puppeteer-extra-plugin-stealth to control Chromium (or Chrome, or Firefox, though here we work only with Chromium, which is used by default) over the DevTools Protocol in headless or non-headless mode.
To do this, open the command line in the project directory and enter npm init -y, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth.
*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.
Note: you can also use puppeteer without any extensions, but I strongly recommend using it together with puppeteer-extra and puppeteer-extra-plugin-stealth to prevent the website from detecting that you are using headless Chromium or a web driver. You can check this on the Chrome headless tests website. The screenshot below shows you the difference.
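As a quick, hedged way to see what the stealth plugin changes, the sketch below (the test URL and screenshot file name are illustrative assumptions, not part of the original code) opens a headless-detection test page and saves a screenshot of its results:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());

(async () => {
  // Launch headless Chromium with the stealth patches applied
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  // bot.sannysoft.com is one example of a headless-detection test page (assumption)
  await page.goto("https://bot.sannysoft.com/", { waitUntil: "networkidle2" });
  await page.screenshot({ path: "stealth-test.png", fullPage: true });
  await browser.close();
})();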
Process
The SelectorGadget Chrome extension was used to grab CSS selectors by clicking on the desired element in the browser. If you have any trouble understanding this, we have a dedicated Web Scraping with CSS Selectors blog post at SerpApi.
The GIF below illustrates the approach of selecting different parts of the results.
Code explanation
Declare constants from required libraries:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
Code | Explanation |
---|---|
puppeteer | Chromium control library |
StealthPlugin | library to prevent the website from detecting that you are using a web driver |
Next, we "tell" puppeteer to use StealthPlugin and write the link to the video page:
puppeteer.use(StealthPlugin());
const videoLink = "https://www.youtube.com/watch?v=fou37kNbsqE"; // link to video page
Next, we write a function for scrolling the page. The first step is to get the current scrollHeight of the container:
async function scrollPage(page, scrollContainer) {
let lastHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
After that, we need to keep scrolling until no comments are left, using a while loop, which lets us iterate until we decide to exit. It is used in combination with evaluate() to execute code in the page context:
while (true) {
await page.evaluate(`window.scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`);
await page.waitForTimeout(2000); // waiting 2000 ms before continue
The final step is to check whether the current height is the same as the previous height and, if so, break out of the scrolling loop. Otherwise, we update the previous height with the current one and continue scrolling:
let newHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
if (newHeight === lastHeight) {
break;
}
lastHeight = newHeight;
}
}
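The fixed two-second pause in scrollPage is the simplest approach. As an alternative sketch (an assumption, not the original code), the same loop could wait until scrollHeight actually grows, breaking out when it stops changing:

// Alternative sketch: wait until scrollHeight grows instead of sleeping a fixed time.
// If it does not grow within 5 seconds, assume we reached the bottom.
async function scrollPageUntilStable(page, scrollContainer) {
  while (true) {
    const lastHeight = await page.evaluate(
      (sel) => document.querySelector(sel).scrollHeight,
      scrollContainer
    );
    await page.evaluate(
      (sel) => window.scrollTo(0, document.querySelector(sel).scrollHeight),
      scrollContainer
    );
    try {
      await page.waitForFunction(
        (sel, prev) => document.querySelector(sel).scrollHeight > prev,
        { timeout: 5000 },
        scrollContainer,
        lastHeight
      );
    } catch {
      break; // height did not change, no more content to load
    }
  }
}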
Next, we write a function for getting data from the video page:
async function fillDataFromPage(page, newDesign) {
...
}
Inside fillDataFromPage, we write the evaluate function and pass the newDesign variable into it so that we can use it in the page context:
const dataFromPage = await page.evaluate((newDesign) => {
...
}, newDesign);
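page.evaluate() serializes every argument given after the callback and hands them to it inside the browser. A minimal sketch of this pattern (the selector and limit below are purely illustrative, and page is assumed to be an open Puppeteer page inside an async function):

// Arguments after the callback are passed to it, in order, inside the page context
const texts = await page.evaluate(
  (selector, limit) =>
    Array.from(document.querySelectorAll(selector))
      .slice(0, limit)
      .map((el) => el.textContent.trim()),
  "#video-title", // selector (illustrative)
  5 // limit (illustrative)
);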
Then, in the evaluate function, we write code to get date and views separately, because we will need this data later:
const date = document
.querySelector(newDesign ? "#description-inline-expander > yt-formatted-string span:nth-child(3)" : "#info-strings yt-formatted-string")
?.textContent.trim();
const views = document
.querySelector(newDesign ? "#description-inline-expander > yt-formatted-string span:nth-child(1)" : "#info-text #count")
?.textContent.trim();
Code | Explanation |
---|---|
document.querySelector("someSelector") | returns the first HTML element matching the selector someSelector that is any child of the document element |
.textContent | gets the raw text of an HTML element |
.trim() | removes whitespace from both ends of a string |
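Note that the optional chaining operator (?.) used throughout the evaluate callback is what keeps it from throwing when a selector matches nothing; a tiny illustrative sketch (the #missing selector is made up):

// Without optional chaining, a missing element throws:
// document.querySelector("#missing").textContent  ->  TypeError
// With optional chaining, the expression simply yields undefined:
const text = document.querySelector("#missing")?.textContent?.trim(); // undefined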
Next, we get the title and likes using the document's .querySelector() method and the .getAttribute("aria-label") method of the element that was found:
return {
title: document.querySelector(`${newDesign ? "#title >" : "#info-contents"} h1`)?.textContent.trim(),
likes: parseInt(
document
.querySelector(`${newDesign ? "#top-row" : "#menu"} #top-level-buttons-computed > ytd-toggle-button-renderer:first-child #text`)
?.getAttribute("aria-label")
.replace(/,/g, "")
),
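Counts on YouTube come in several shapes ("14,699", "708,814 views", "2.3M views"), so a single comma replacement is fragile. Below is a hedged sketch of a small helper (an assumption, not part of the original code) that normalizes such strings into numbers:

// Sketch: normalize a YouTube count string into a number.
// Assumes only K/M/B abbreviations; unparsable input yields undefined.
function parseCount(text) {
  if (!text) return undefined;
  const match = text.replace(/,/g, "").match(/([\d.]+)\s*([KMB])?/i);
  if (!match) return undefined;
  const value = parseFloat(match[1]);
  const multiplier = { K: 1e3, M: 1e6, B: 1e9 }[match[2]?.toUpperCase()] || 1;
  return Math.round(value * multiplier);
}

console.log(parseCount("14,699")); // 14699
console.log(parseCount("2.3M views")); // 2300000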
After that, we get the channel info, namely the channel's name, link, and thumbnail:
channel: {
name: document.querySelector(`${newDesign ? "#owner" : "ytd-video-owner-renderer"} #channel-name #text > a`)?.textContent.trim(),
link: `https://www.youtube.com${document.querySelector(`${newDesign ? "#owner" : ""} ytd-video-owner-renderer > a`)?.getAttribute("href")}`,
thumbnail: document.querySelector(`${newDesign ? "#owner" : "ytd-video-owner-renderer"} #avatar #img`)?.getAttribute("src"),
},
Next, we write the date and views that were received earlier and return them from the evaluate function. We also need to remove the date and views from the description string, because on the new page design the description includes these fields:
date,
views: views && parseInt(views.replace(/,/g, "")),
description: newDesign
? document.querySelector("#description-inline-expander > yt-formatted-string")?.textContent.replace(date, "").replace(views, "").trim()
: document.querySelector("#meta #description")?.textContent.trim(),
Next, we get the duration and hashtags. To get the hashtags we use the .querySelectorAll() method, which returns a static NodeList of all the document's elements that match the given CSS selector, and convert the result to an array with the Array.from() method:
duration: document.querySelector(".ytp-time-duration")?.textContent.trim(),
hashtags: Array.from(document.querySelectorAll(`${newDesign ? "#super-title" : "#info-contents .super-title"} a`)).map((el) =>
el.textContent.trim()
),
Then, we need to get the suggestedVideos info, which consists of title, link, channelName, date, views, duration and thumbnail:
suggestedVideos: Array.from(document.querySelectorAll("ytd-compact-video-renderer")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `https://www.youtube.com${el.querySelector("#thumbnail")?.getAttribute("href")}`,
channelName: el.querySelector("#channel-name #text")?.textContent.trim(),
date: el.querySelector("#metadata-line span:nth-child(2)")?.textContent.trim(),
views: el.querySelector("#metadata-line span:nth-child(1)")?.textContent.trim(),
duration: el.querySelector("#overlays #text")?.textContent.trim(),
thumbnail: el.querySelector("#img")?.getAttribute("src"),
})),
And lastly, we get all comments with their full info (author, link, date, likes, comment and avatar):
comments: Array.from(document.querySelectorAll("#contents > ytd-comment-thread-renderer")).map((el) => ({
author: el.querySelector("#author-text")?.textContent.trim(),
link: `https://www.youtube.com${el.querySelector("#author-text")?.getAttribute("href")}`,
date: el.querySelector(".published-time-text")?.textContent.trim(),
likes: el.querySelector("#vote-count-middle")?.textContent.trim(),
comment: el.querySelector("#content-text")?.textContent.trim(),
avatar: el.querySelector("#author-thumbnail #img")?.getAttribute("src"),
})),
};
Next, we write a function to control the browser and get the information:
async function getYoutubeVideoPageResults() {
...
}
In this function, we first define the browser using the puppeteer.launch({options}) method with the current options: headless: false and args: ["--no-sandbox", "--disable-setuid-sandbox"]. These options mean that we run the browser in non-headless (visible) mode and pass an array of arguments that allows launching the browser process in the online IDE. Then we open a new page:
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
Next, we change the default navigation timeout (30 seconds) to 60000 ms (1 minute) to cope with slow internet connections using the .setDefaultNavigationTimeout() method, and go to the videoLink URL with the .goto() method:
await page.setDefaultNavigationTimeout(60000);
await page.goto(videoLink);
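If the page is slow to settle, .goto() also accepts a waitUntil option; as a hedged variant (an assumption, not the original code), the navigation could wait until network activity quiets down:

// Variant sketch: wait until the network is (almost) idle before continuing
await page.goto(videoLink, { waitUntil: "networkidle2", timeout: 60000 });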
Then, we use the .waitForSelector() method to wait until the #contents selector appears on the page. Next, we try to find the #title > h1 selector with the .$() method and save the result into the isDesign1 constant, so that we can click (with the .click() method) on the correct "show more" button selector:
await page.waitForSelector("#contents");
const isDesign1 = await page.$("#title > h1");
if (isDesign1) {
await page.click("#description-inline-expander #expand");
} else {
await page.click("#meta #more");
}
const scrollContainer = "ytd-app";
await scrollPage(page, scrollContainer);
await page.waitForTimeout(10000);
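The expand button is not always attached by the time we click it; a hedged variant of the click step (an assumption, not the original code) waits for whichever button matches the detected design and skips it quietly if it never appears:

// Variant sketch: wait for the "show more" button before clicking,
// and do not fail the whole run if it never appears.
const expandSelector = isDesign1 ? "#description-inline-expander #expand" : "#meta #more";
try {
  await page.waitForSelector(expandSelector, { timeout: 5000 });
  await page.click(expandSelector);
} catch {
  // description expander not found; continue without expanding it
}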
And finally, we get and return data from the page and close the browser:
const infoFromVideoPage = await fillDataFromPage(page, isDesign1);
await browser.close();
return infoFromVideoPage;
Now we can launch our parser. To do this, enter node YOUR_FILE_NAME in your command line, where YOUR_FILE_NAME is the name of your .js file.
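Instead of printing to the console, the result can just as easily be written to disk; here is a small sketch using Node's built-in fs module (the output file name is arbitrary):

const fs = require("fs");

getYoutubeVideoPageResults().then((result) => {
  // Save the scraped data as pretty-printed JSON next to the script
  fs.writeFileSync("video-data.json", JSON.stringify(result, null, 2));
  console.log("Saved results to video-data.json");
});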
Output
{
"title":"The Life of Luke Skywalker β’ Entire Timeline Explained (Star Wars)",
"likes":14699,
"channel":{
"name":"MovieFlame",
"link":"https://www.youtube.com/c/MovieFlame",
"thumbnail":"https://yt3.ggpht.com/ytc/AMLnZu86EFuWtLin_e9RrleT2PJVyFBMA6u9-QcI7calxQ=s48-c-k-c0x00ffffff-no-rj"
},
"date":"Jan 8, 2020",
"views":708814,
"description":"Patreon: https://www.patreon.com/MovieFlamePro...\n""+""Twitter: https://twitter.com/MovieFlameProd\n""+""Personal Instagram: https://www.instagram.com/morgan_ross18/\n""+""Facebook: https://www.facebook.com/MovieFlame/\n""+""\n""+""Music- By Ross Bugden https://www.youtube.com/watch?v=9qk-v...",
"duration":"28:02",
"hashtags":[
],
"suggestedVideos":[
{
"title":"The Life of Obi-Wan Kenobi Explained (Padawan, Clone Wars & Tatooine Years)",
"link":"https://www.youtube.com/watch?v=2uKLSAyNNQY",
"channelName":"MovieFlame",
"date":"4 years ago",
"views":"2.3M views",
"duration":"18:23",
"thumbnail":"https://i.ytimg.com/vi/2uKLSAyNNQY/hqdefault.jpg?sqp=-oaymwEbCKgBEF5IVfKriqkDDggBFQAAiEIYAXABwAEG&rs=AOn4CLCAa04Nks-1bkpApP2bnvPUI48sjg"
},
... and other suggested videos
],
"comments":[
{
"author":"MovieFlame",
"link":"https://www.youtube.com/channel/UCOajpsI8t3Eg-u-s2j_c-cQ",
"date":"2 years ago (edited)",
"likes":"765",
"comment":"Boy did this video take a lot of hard work and a ton of research PLEASE LIKE AND SHARE so my hard work pays off! You guys are the best! :)",
"avatar":"https://yt3.ggpht.com/ytc/AMLnZu86EFuWtLin_e9RrleT2PJVyFBMA6u9-QcI7calxQ=s48-c-k-c0x00ffffff-no-rj"
},
... and other comments
]
}
If you want to see some projects made with SerpApi, please write me a message.
Add a Feature Request or a Bug