Mikhail Zub for SerpApi

Posted on Jul 12, 2022

Web Scraping Google Scholar Cite Results with Nodejs

#webscraping #node #serpapi #google

What will be scraped

Preparation

First, we need to create a Node.js* project and add npm packages puppeteer, puppeteer-extra and puppeteer-extra-plugin-stealth to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.

To do this, in the directory with our project, open the command line and enter npm init -y, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth.

*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.

📌Note: you can also use puppeteer without any extensions, but I strongly recommend using it with puppeteer-extra and puppeteer-extra-plugin-stealth to prevent websites from detecting that you are using headless Chromium or a web driver. You can check it on the Chrome headless tests website. The screenshot below shows the difference.

Process

SelectorGadget Chrome extension was used to grab CSS selectors by clicking on the desired element in the browser. If you have any struggles understanding this, we have a dedicated Web Scraping with CSS Selectors blog post at SerpApi.
The Gif below illustrates the approach of selecting different parts of the results.

Full code

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const requestParams = {
  q: "astronomy",                                           // what we want to search
  hl: "en",                                                 // parameter defines the language to use for the Google search
};

const domain = `https://scholar.google.com`;                // HTTPS, so requests are not sent over plain HTTP
const pagesLimit = Infinity;                                // limit of pages for getting info
let currentPage = 1;                                        // number of the results page that is being scraped

/**
 * Collects the `data-cid` attribute of every `.gs_r.gs_scl` element, page by page.
 *
 * Keeps clicking the last pagination link until that link is absent or
 * `pagesLimit` pages have been scraped. Reads the module-level `pagesLimit`
 * and increments the module-level `currentPage`.
 *
 * @param {object} page - Puppeteer page that already shows the first results page
 * @returns {Promise<Array>} `data-cid` values from all visited pages, in page order
 */
async function getCitesId(page) {
  const citesId = [];
  while (true) {
    await page.waitForSelector(".gs_r.gs_scl");
    const citesIdFromPage = await page.evaluate(async () => {
      return Array.from(document.querySelectorAll(".gs_r.gs_scl")).map((el) => el.getAttribute("data-cid"));
    });
    citesId.push(...citesIdFromPage);
    const isNextPage = await page.$("#gs_n td:last-child a");
    // `>=` because the current page has already been scraped at this point:
    // with `>` one page more than `pagesLimit` would be visited
    if (!isNextPage || currentPage >= pagesLimit) break;
    await page.evaluate(async () => {
      document.querySelector("#gs_n td:last-child a").click();
    });
    await page.waitForTimeout(3000);
    currentPage++;
  }
  return citesId;
}

/**
 * Reads the citation formats and the export links from the currently opened cite page.
 *
 * @param {object} page - Puppeteer page that shows a cite view
 * @returns {Promise<{citations: Array, links: Array}>} rows of `#gs_citt` and anchors of `#gs_citi`
 */
async function fillCiteData(page) {
  return page.evaluate(() => {
    // the helper lives inside the callback because the callback runs in the page context
    const cleanText = (node) => node.textContent.trim();

    const citations = [];
    for (const row of document.querySelectorAll("#gs_citt tr")) {
      citations.push({
        title: cleanText(row.querySelector("th")),
        snippet: cleanText(row.querySelector("td")),
      });
    }

    const links = [];
    for (const anchor of document.querySelectorAll("#gs_citi a")) {
      links.push({
        name: cleanText(anchor),
        link: anchor.getAttribute("href"),
      });
    }

    return { citations, links };
  });
}

/**
 * Launches the browser, collects the cite IDs of the search results and then
 * opens the cite page of every ID to read its citations and links.
 *
 * A cite page that fails is logged and skipped. The browser is closed even if
 * the function throws.
 *
 * @returns {Promise<Array>} one `{ citations, links }` object per successfully scraped ID
 */
async function getScholarCitesInfo() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  // `finally` guarantees that the browser process is not left running after an error
  try {
    const page = await browser.newPage();

    // the query is encoded so that spaces and characters like `&` do not break the URL
    const URL = `${domain}/scholar?hl=${requestParams.hl}&q=${encodeURIComponent(requestParams.q)}`;

    await page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);
    await page.waitForSelector(".gs_r.gs_scl");
    await page.waitForTimeout(1000);

    const citesId = await getCitesId(page);
    const allCites = [];

    // `const` keeps `id` local; without a declaration it becomes an implicit global
    for (const id of citesId) {
      const citeURL = `${domain}/scholar?q=info:${id}:scholar.google.com/&output=cite&hl=${requestParams.hl}`;
      try {
        await page.goto(citeURL);
        await page.waitForTimeout(2000);
        allCites.push(await fillCiteData(page));
      } catch (error) {
        // best effort: report the failed ID together with the reason and go on with the next one
        console.log("Something was wrong with getting info from ID: ", id, error);
      }
    }

    return allCites;
  } finally {
    await browser.close();
  }
}

getScholarCitesInfo().then((result) => console.dir(result, { depth: null }));

Code explanation

Declare constants from required libraries:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

Code	Explanation
`puppeteer`	Chromium control library
`StealthPlugin`	plugin that prevents websites from detecting that you are using a web driver

Next, we "say" to puppeteer use StealthPlugin:

puppeteer.use(StealthPlugin());

Next, we write what we want to search and the necessary parameters for making a request:

const requestParams = {
  q: "astronomy",
  hl: "en",
};

const domain = `https://scholar.google.com`; // HTTPS, so requests are not sent over plain HTTP
const pagesLimit = Infinity;
let currentPage = 1;

Code	Explanation
`q`	search query
`hl`	parameter defines the language to use for the Google Scholar search
`pagesLimit`	limit of pages for getting info. If you want to limit the number of pages for getting info, you need to define the last page number in this constant

Next, we write down a function for getting citations ID from all pages:

/**
 * Collects the `data-cid` attribute of every `.gs_r.gs_scl` element, page by page.
 *
 * Keeps clicking the last pagination link until that link is absent or
 * `pagesLimit` pages have been scraped. Reads the module-level `pagesLimit`
 * and increments the module-level `currentPage`.
 *
 * @param {object} page - Puppeteer page that already shows the first results page
 * @returns {Promise<Array>} `data-cid` values from all visited pages, in page order
 */
async function getCitesId(page) {
  const citesId = [];
  while (true) {
    await page.waitForSelector(".gs_r.gs_scl");
    const citesIdFromPage = await page.evaluate(async () => {
      return Array.from(document.querySelectorAll(".gs_r.gs_scl")).map((el) => el.getAttribute("data-cid"));
    });
    citesId.push(...citesIdFromPage);
    const isNextPage = await page.$("#gs_n td:last-child a");
    // `>=` because the current page has already been scraped at this point:
    // with `>` one page more than `pagesLimit` would be visited
    if (!isNextPage || currentPage >= pagesLimit) break;
    await page.evaluate(async () => {
      document.querySelector("#gs_n td:last-child a").click();
    });
    await page.waitForTimeout(3000);
    currentPage++;
  }
  return citesId;
}

Code	Explanation
`citesId`	an array with cite ID from the all pages
`page.waitForSelector(".gs_r.gs_scl")`	stops the script and waits for the html element with the `.gs_r.gs_scl` selector to load
`page.evaluate(async () => {`	is the Puppeteer method for injecting `function` in the page context and allows to return data directly from the browser
`document.querySelectorAll(".gs_r.gs_scl")`	returns a static NodeList representing a list of the document's elements that match the css selectors with class name `gs_r.gs_scl`
`.getAttribute("data-cid")`	gets the `data-cid` attribute value of the html element
`citesId.push(...citesIdFromPage)`	in this code, we use spread syntax to split the `citesIdFromPage` array into elements and add them in the end of `citesId` array
`page.$("#gs_n td:last-child a");`	this method finds the html element with the `#gs_n td:last-child a` selector and returns it
`document.querySelector("#gs_n td:last-child a")`	returns the first html element with selector `#gs_n td:last-child a` which is any child of the `document` html element
`.click()`	triggers a click event on html element
`page.waitForTimeout(3000)`	waiting 3000 ms before continue

Next, we write down a function for getting citations data from page:

/**
 * Reads the citation formats and the export links from the currently opened cite page.
 *
 * @param {object} page - Puppeteer page that shows a cite view
 * @returns {Promise<{citations: Array, links: Array}>} rows of `#gs_citt` and anchors of `#gs_citi`
 */
async function fillCiteData(page) {
  return page.evaluate(() => {
    // the helper lives inside the callback because the callback runs in the page context
    const cleanText = (node) => node.textContent.trim();

    const citations = [];
    for (const row of document.querySelectorAll("#gs_citt tr")) {
      citations.push({
        title: cleanText(row.querySelector("th")),
        snippet: cleanText(row.querySelector("td")),
      });
    }

    const links = [];
    for (const anchor of document.querySelectorAll("#gs_citi a")) {
      links.push({
        name: cleanText(anchor),
        link: anchor.getAttribute("href"),
      });
    }

    return { citations, links };
  });
}

Code	Explanation
`.textContent`	gets the raw text of html element
`.trim()`	removes whitespace from both ends of a string

And finally, a function to control the browser and get the citation info for every search result:

/**
 * Launches the browser, collects the cite IDs of the search results and then
 * opens the cite page of every ID to read its citations and links.
 *
 * A cite page that fails is logged and skipped. The browser is closed even if
 * the function throws.
 *
 * @returns {Promise<Array>} one `{ citations, links }` object per successfully scraped ID
 */
async function getScholarCitesInfo() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  // `finally` guarantees that the browser process is not left running after an error
  try {
    const page = await browser.newPage();

    // the query is encoded so that spaces and characters like `&` do not break the URL
    const URL = `${domain}/scholar?hl=${requestParams.hl}&q=${encodeURIComponent(requestParams.q)}`;

    await page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);
    await page.waitForSelector(".gs_r.gs_scl");
    await page.waitForTimeout(1000);

    const citesId = await getCitesId(page);
    const allCites = [];

    // `const` keeps `id` local; without a declaration it becomes an implicit global
    for (const id of citesId) {
      const citeURL = `${domain}/scholar?q=info:${id}:scholar.google.com/&output=cite&hl=${requestParams.hl}`;
      try {
        await page.goto(citeURL);
        await page.waitForTimeout(2000);
        allCites.push(await fillCiteData(page));
      } catch (error) {
        // best effort: report the failed ID together with the reason and go on with the next one
        console.log("Something was wrong with getting info from ID: ", id, error);
      }
    }

    return allCites;
  } finally {
    await browser.close();
  }
}

getScholarCitesInfo().then((result) => console.dir(result, { depth: null }));

Code	Explanation
`puppeteer.launch({options})`	this method launches a new instance of the Chromium browser with current `options`
`headless`	defines which mode to use: headless (by default) or non-headless
`args`	an array with arguments which is used with Chromium
`["--no-sandbox", "--disable-setuid-sandbox"]`	these arguments we use to allow the launch of the browser process in the online IDE
`browser.newPage()`	this method launches a new page
`page.setDefaultNavigationTimeout(60000)`	changing the default (30 sec) navigation timeout to 60000 ms (1 min) for slow internet connection
`page.goto(URL)`	navigation to `URL` which is defined above
`browser.close()`	after all we close the browser instance
`console.dir(result, { depth: null })`	console method `dir` allows you to use an object with the necessary parameters to change default output options. Watch Node.js documentation for more info

Now we can launch our parser. To do this enter node YOUR_FILE_NAME in your command line. Where YOUR_FILE_NAME is the name of your .js file.

Output

📌Note: if you see something like [Object] in your console, you can use console.dir(result, { depth: null }) instead of console.log(). Watch Node.js documentation for more info.

[
   {
      "citations":[
         {
            "title":"MLA",
            "snippet":"Feigelson, Eric D., and G. Jogesh Babu. Modern statistical methods for astronomy: with R applications. Cambridge University Press, 2012."
         },
         {
            "title":"APA",
            "snippet":"Feigelson, E. D., & Babu, G. J. (2012). Modern statistical methods for astronomy: with R applications. Cambridge University Press."
         },
         {
            "title":"Chicago",
            "snippet":"Feigelson, Eric D., and G. Jogesh Babu. Modern statistical methods for astronomy: with R applications. Cambridge University Press, 2012."
         },
         {
            "title":"Harvard",
            "snippet":"Feigelson, E.D. and Babu, G.J., 2012. Modern statistical methods for astronomy: with R applications. Cambridge University Press."
         },
         {
            "title":"Vancouver",
            "snippet":"Feigelson ED, Babu GJ. Modern statistical methods for astronomy: with R applications. Cambridge University Press; 2012 Jul 12."
         }
      ],
      "links":[
         {
            "name":"BibTeX",
            "link":"https://scholar.googleusercontent.com/scholar.bib?q=info:ec7TPNOf0BkJ:scholar.google.com/&output=citation&scisdr=CgXMI1ygEIvc-VD2gb4:AAGBfm0AAAAAYsbwmb9DmKveBMs7b13qmAh07Kz7E5wZ&scisig=AAGBfm0AAAAAYsbwmQ_qHPPtGUXxQKyT1ubTz2dZxkNs&scisf=4&ct=citation&cd=-1&hl=en"
         },
         {
            "name":"EndNote",
            "link":"https://scholar.googleusercontent.com/scholar.enw?q=info:ec7TPNOf0BkJ:scholar.google.com/&output=citation&scisdr=CgXMI1ygEIvc-VD2gb4:AAGBfm0AAAAAYsbwmb9DmKveBMs7b13qmAh07Kz7E5wZ&scisig=AAGBfm0AAAAAYsbwmQ_qHPPtGUXxQKyT1ubTz2dZxkNs&scisf=3&ct=citation&cd=-1&hl=en"
         },
         {
            "name":"RefMan",
            "link":"https://scholar.googleusercontent.com/scholar.ris?q=info:ec7TPNOf0BkJ:scholar.google.com/&output=citation&scisdr=CgXMI1ygEIvc-VD2gb4:AAGBfm0AAAAAYsbwmb9DmKveBMs7b13qmAh07Kz7E5wZ&scisig=AAGBfm0AAAAAYsbwmQ_qHPPtGUXxQKyT1ubTz2dZxkNs&scisf=2&ct=citation&cd=-1&hl=en"
         },
         {
            "name":"RefWorks",
            "link":"https://scholar.googleusercontent.com/scholar.rfw?q=info:ec7TPNOf0BkJ:scholar.google.com/&output=citation&scisdr=CgXMI1ygEIvc-VD2gb4:AAGBfm0AAAAAYsbwmb9DmKveBMs7b13qmAh07Kz7E5wZ&scisig=AAGBfm0AAAAAYsbwmQ_qHPPtGUXxQKyT1ubTz2dZxkNs&scisf=1&ct=citation&cd=-1&hl=en"
         }
      ]
   }
   ...and other results
]

Google Scholar Cite API

Alternatively, you can use the Google Scholar Cite API from SerpApi. SerpApi is a free API with 100 searches per month. If you need more searches, there are paid plans.

The difference is that you won't have to write code from scratch and maintain it. With your own parser you may also experience blocking from Google, and selectors may change, which will break the parser. With the API you just need to iterate the structured JSON and get the data you want. Check out the playground.

First, we need to install google-search-results-nodejs. To do this you need to enter in your console: npm i google-search-results-nodejs

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com

const searchString = "astronomy";                         // what we want to search
const pagesLimit = Infinity;                              // limit of pages for getting info
let currentPage = 1;                                      // current page of the search

const params = {
  engine: "google_scholar",                               // search engine
  q: searchString,                                        // search query
  hl: "en",                                               // Parameter defines the language to use for the Google search
};

// Promise wrapper around the callback-based `search.json`:
// resolves with whatever the library passes to the callback for the current `params`
const getJson = () =>
  new Promise((onResponse) => {
    search.json(params, onResponse);
  });

/**
 * Collects `result_id` of every organic result from all search pages (up to
 * `pagesLimit`), then requests the cite data of every collected ID.
 *
 * Mutates the shared `params` object, because `getJson` reads the request from it.
 *
 * @returns {Promise<Array>} one `{ id, citations, links }` object per result ID
 */
const getResults = async () => {
  const allCites = [];
  const citesId = [];
  while (true) {
    if (currentPage > pagesLimit) break;
    const json = await getJson();
    // a response without `organic_results` simply adds nothing instead of throwing
    (json.organic_results || []).forEach((el) => {
      citesId.push(el.result_id);
    });
    if (json.pagination && json.pagination.next) {
      // the first request has no `start`, so it counts as 0; every next page is 10 results further
      params.start = (params.start || 0) + 10;
    } else break;
    currentPage++;
  }
  delete params.hl;
  params.engine = "google_scholar_cite";
  // `const` keeps `id` local; without a declaration it becomes an implicit global
  for (const id of citesId) {
    params.q = id;
    const { citations, links } = await getJson();
    allCites.push({ id, citations, links });
  }
  return allCites;
};


// `getResults` is a function, so it has to be called to get the promise
getResults().then((result) => console.dir(result, { depth: null }));

Code explanation

Declare constants from required libraries:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);

Code	Explanation
`SerpApi`	SerpApi Node.js library
`search`	new instance of GoogleSearch class
`API_KEY`	your API key from SerpApi

Next, we write down what we want to search and the necessary parameters for making a request:

const searchString = "astronomy";
const pagesLimit = Infinity;
let currentPage = 1;

const params = {
  engine: "google_scholar",
  q: searchString,
  hl: "en",
};

Code	Explanation
`searchString`	what we want to search
`pagesLimit`	limit of pages for getting info. If you want to limit the number of pages for getting info, you need to define the last page number in this constant
`engine`	search engine
`q`	search query
`hl`	parameter defines the language to use for the Google Scholar search

Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:

// Promise wrapper around the callback-based `search.json`:
// resolves with whatever the library passes to the callback for the current `params`
const getJson = () =>
  new Promise((onResponse) => {
    search.json(params, onResponse);
  });

And finally, we declare and run the function getResults that gets all citation IDs from all pages, gets all citations info and returns it:

/**
 * Collects `result_id` of every organic result from all search pages (up to
 * `pagesLimit`), then requests the cite data of every collected ID.
 *
 * Mutates the shared `params` object, because `getJson` reads the request from it.
 *
 * @returns {Promise<Array>} one `{ id, citations, links }` object per result ID
 */
const getResults = async () => {
  const allCites = [];
  const citesId = [];
  while (true) {
    if (currentPage > pagesLimit) break;
    const json = await getJson();
    // a response without `organic_results` simply adds nothing instead of throwing
    (json.organic_results || []).forEach((el) => {
      citesId.push(el.result_id);
    });
    if (json.pagination && json.pagination.next) {
      // the first request has no `start`, so it counts as 0; every next page is 10 results further
      params.start = (params.start || 0) + 10;
    } else break;
    currentPage++;
  }
  delete params.hl;
  params.engine = "google_scholar_cite";
  // `const` keeps `id` local; without a declaration it becomes an implicit global
  for (const id of citesId) {
    params.q = id;
    const { citations, links } = await getJson();
    allCites.push({ id, citations, links });
  }
  return allCites;
};

getResults().then((result) => console.dir(result, { depth: null }))

Code	Explanation
`allCites`	an array with all citations info from all pages
`citesId`	an array with cite ID from all pages
`citations, links`	data that we destructured from the response
`console.dir(result, { depth: null })`	console method `dir` allows you to use an object with the necessary parameters to change default output options. Watch Node.js documentation for more info

Output

[
   {
      "id":"PkuLyccmJ74J",
      "citations":[
         {
            "title":"MLA",
            "snippet":"Zwicky, Fritz. Morphological astronomy. Springer Science & Business Media, 2012."
         },
         {
            "title":"APA",
            "snippet":"Zwicky, F. (2012). Morphological astronomy. Springer Science & Business Media."
         },
         {
            "title":"Chicago",
            "snippet":"Zwicky, Fritz. Morphological astronomy. Springer Science & Business Media, 2012."
         },
         {
            "title":"Harvard",
            "snippet":"Zwicky, F., 2012. Morphological astronomy. Springer Science & Business Media."
         },
         {
            "title":"Vancouver",
            "snippet":"Zwicky F. Morphological astronomy. Springer Science & Business Media; 2012 Dec 6."
         }
      ],
      "links":[
         {
            "name":"BibTeX",
            "link":"https://scholar.googleusercontent.com/scholar.bib?q=info:PkuLyccmJ74J:scholar.google.com/&output=citation&scisdr=CgU4uY14GAA:AAGBfm0AAAAAYsb-vkn1FCUdWV07MWHG9cBPQ2Vwxm1R&scisig=AAGBfm0AAAAAYsb-vjrBs9xBEzph-DPHmLXeLseRh7s5&scisf=4&ct=citation&cd=-1&hl=en"
         },
         {
            "name":"EndNote",
            "link":"https://scholar.googleusercontent.com/scholar.enw?q=info:PkuLyccmJ74J:scholar.google.com/&output=citation&scisdr=CgU4uY14GAA:AAGBfm0AAAAAYsb-vkn1FCUdWV07MWHG9cBPQ2Vwxm1R&scisig=AAGBfm0AAAAAYsb-vjrBs9xBEzph-DPHmLXeLseRh7s5&scisf=3&ct=citation&cd=-1&hl=en"
         },
         {
            "name":"RefMan",
            "link":"https://scholar.googleusercontent.com/scholar.ris?q=info:PkuLyccmJ74J:scholar.google.com/&output=citation&scisdr=CgU4uY14GAA:AAGBfm0AAAAAYsb-vkn1FCUdWV07MWHG9cBPQ2Vwxm1R&scisig=AAGBfm0AAAAAYsb-vjrBs9xBEzph-DPHmLXeLseRh7s5&scisf=2&ct=citation&cd=-1&hl=en"
         },
         {
            "name":"RefWorks",
            "link":"https://scholar.googleusercontent.com/scholar.rfw?q=info:PkuLyccmJ74J:scholar.google.com/&output=citation&scisdr=CgU4uY14GAA:AAGBfm0AAAAAYsb-vkn1FCUdWV07MWHG9cBPQ2Vwxm1R&scisig=AAGBfm0AAAAAYsb-vjrBs9xBEzph-DPHmLXeLseRh7s5&scisf=1&ct=citation&cd=-1&hl=en"
         }
      ]
   },
   ...and other results
]