Web scraping Google Arts & Culture Artist Results with Nodejs

#webscraping #node #serpapi

Intro

We currently don't have an API for the Google Arts & Culture Artists page.

This blog post shows a DIY solution for extracting data from the "All", "A-Z" and "Time" tabs, which you can use for personal purposes while we're working on releasing our proper API.

This solution is intended for personal use, as it doesn't include the Legal US Shield that we offer for our paid Production and above plans, and it has limitations, such as the need to bypass blocks (for example, CAPTCHA).

You can check our public roadmap to track the progress for this API:

🗺️[New API] Google Arts & Culture - Artists

What will be scraped

Full code

If you don't need an explanation, have a look at the full code example in the online IDE

const axios = require("axios");

// Request configuration shared by every axios call made to Google Arts & Culture.
const AXIOS_OPTIONS = {
  baseURL: "https://artsandculture.google.com",
  // A browser-like User-Agent is sent as one way to keep the request from being blocked.
  headers: {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
  },
  // Query-string parameters: hl selects the language ("en").
  params: {
    hl: "en",
  },
};
/**
 * Extracts every artist entry found in one category's raw page-source chunk.
 *
 * @param {string} categoryContent - slice of the page source belonging to one category
 * @returns {Array<{artist: string, works: string, thumbnail: string, link: string}>}
 *   one object per regex match, in match order; empty when nothing matches
 */
function getResultsFromCategory(categoryContent) {
  // Pattern demo: https://regex101.com/r/DXd78Z/1
  const artistsPattern = /cobject","(?<artist>[^"]+)","(?<works>[^ ]+) \w+","(?<thumbnail>[^"]+)","(?<link>[^"]+)/gm;

  const artists = [];
  for (const match of categoryContent.matchAll(artistsPattern)) {
    const { artist, works, thumbnail, link } = match.groups;
    artists.push({
      artist,
      works,
      // the captured thumbnail has no scheme, so "https:" is prepended
      thumbnail: `https:${thumbnail}`,
      // JSON.parse decodes escape sequences such as "\u003d" in the captured path
      link: `${AXIOS_OPTIONS.baseURL}${JSON.parse(`"${link}"`)}`,
    });
  }
  return artists;
}

/**
 * Requests the artists category page and extracts the artists of every category
 * found in the page source.
 *
 * @returns {Promise<Object>} resolves to an object whose keys are `popular`, each
 *   matched A-Z letter and each matched time period, and whose values are the arrays
 *   returned by getResultsFromCategory for that category's content
 */
function getArtistsInfo() {
  return axios.get("/category/artist", AXIOS_OPTIONS).then(function ({ data }) {
    const results = {};

    // Popular artists -> results.popular (pattern demo: https://regex101.com/r/kyTsgs/1)
    const popularCategoryPattern = /"PopularAssets:(?<content>.+?)\["stella\.pr/gm;
    [...data.matchAll(popularCategoryPattern)].forEach(({ groups }) => {
      results.popular = getResultsFromCategory(groups.content);
    });

    // One key per matched letter (pattern demo: https://regex101.com/r/u1KZFf/1)
    const azCategoryPattern = /"(?<letter>[^"])",\["stella\.pr","(?<content>.+?)[\w"||\d]\]{2,3},\[/gm;
    // forEach, not map: the callback is run only for its side effect on `results`
    [...data.matchAll(azCategoryPattern)].forEach(({ groups }) => {
      results[groups.letter] = getResultsFromCategory(groups.content);
    });

    // One key per matched time period (pattern demo: https://regex101.com/r/5JHaQB/1)
    const timeCategoryPattern = /\[{1,2}"(?<time>[^"]{3,8})","?\w{4,7}.+?\["stella\.pr","DatedAssets(?<content>.+?)"?\d{3,5}"\]/gm;
    [...data.matchAll(timeCategoryPattern)].forEach(({ groups }) => {
      results[groups.time] = getResultsFromCategory(groups.content);
    });

    return results;
  });
}

// Run the scraper and print the complete result object to the console.
getArtistsInfo().then((artistsByCategory) => {
  console.dir(artistsByCategory, { depth: null });
});

Preparation

First, we need to create a Node.js* project and add the npm package axios to make requests to the website.

To do this, in the directory with our project, open the command line and enter:

$ npm init -y   # create new project

And then:

$ npm i axios  # add axios package

*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.

Process

We need to extract the artists' data from the script tags using different regular expressions. The gif below shows what the data looks like in the page source:

Code explanation

First, we need to declare a constant from the axios library and write the request options: HTTP headers with a User-Agent, which makes the request look like a "real" user visit, and the necessary parameters for making the request.

The default axios User-Agent is axios/<axios_version>, so websites can tell that a script is sending the request and might block it. Check what your user-agent is:

const axios = require("axios");

// Request options passed to axios: base URL, HTTP headers and query parameters.
const AXIOS_OPTIONS = {
  baseURL: "https://artsandculture.google.com",
  headers: {
    "User-Agent": 
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
  }, // a browser-like User-Agent header is one way to keep the request from being blocked
  params: {
    hl: "en", // language parameter sent with the request ("en")
  },
};

Next, we write a function that extracts and returns artists data:

function getResultsFromCategory(categoryContent) {
    ...
}

In this function we declare the artistsPattern RegEx, then use spread syntax to turn the iterator of matches returned by the matchAll method into an array, and return a new array built from it with the map() method.

    //https://regex101.com/r/DXd78Z/1
  const artistsPattern = 
        /cobject","(?<artist>[^"]+)","(?<works>[^ ]+) \w+","(?<thumbnail>[^"]+)","(?<link>[^"]+)/gm; 

  return [...categoryContent.matchAll(artistsPattern)].map(({ groups }) => ({
    ...
  }));

To build this array we destructure groups and define an object with the artist info, which contains the artist, works, thumbnail and link fields. To make a valid link we need to decode some escaped symbols (e.g. "\u003d") with the JSON.parse() method:

    artist: groups.artist,
    works: groups.works,
    thumbnail: `https:${groups.thumbnail}`,
    link: `${AXIOS_OPTIONS.baseURL}${JSON.parse(`"${groups.link}"`)}`,

Then, we write a function that makes the request and returns the extracted results. The response from the axios request has a data key, which we destructure to get the page source:

function getArtistsInfo() {
  return axios
        .get("/category/artist", AXIOS_OPTIONS)
        .then(function ({ data }) {
    ...
  });
}

In this function we declare the results object, and using different RegEx patterns add new keys with results from each category:

// Accumulator: one key per category, filled in by the three passes below.
const results = {};

// Popular artists -> results.popular (pattern demo: https://regex101.com/r/kyTsgs/1)
const popularCategoryPattern = /"PopularAssets:(?<content>.+?)\["stella\.pr/gm;
[...data.matchAll(popularCategoryPattern)]
    .forEach(({ groups }) => (results.popular = getResultsFromCategory(groups.content)));

// One key per matched letter (pattern demo: https://regex101.com/r/u1KZFf/1)
const azCategoryPattern = 
        /"(?<letter>[^"])",\["stella\.pr","(?<content>.+?)[\w"||\d]\]{2,3},\[/gm;
[...data.matchAll(azCategoryPattern)]
    .forEach(({ groups }) => (results[groups.letter] = getResultsFromCategory(groups.content)));

// One key per matched time period (pattern demo: https://regex101.com/r/5JHaQB/1)
const timeCategoryPattern = 
        /\[{1,2}"(?<time>[^"]{3,8})","?\w{4,7}.+?\["stella\.pr","DatedAssets(?<content>.+?)"?\d{3,5}"\]/gm;
[...data.matchAll(timeCategoryPattern)]
    .forEach(({ groups }) => (results[groups.time] = getResultsFromCategory(groups.content)));

return results;

And finally, we need to run our function and print all the received information in the console with the console.dir method, which accepts an options object for changing the default output settings:

// depth: null makes console.dir print nested objects in full instead of truncating them
getArtistsInfo().then((result) => console.dir(result, { depth: null }));

Now we can launch our parser:

$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file

Output

{
   "popular":[
      {
         "artist":"Vincent van Gogh",
         "works":"340",
         "thumbnail":"https://lh3.googleusercontent.com/nizOQO4H8v1F3-2Do1m9Rj1j4baLBqLS57HMbyi83_kZv_F1yFEc-2iqHYqBRGzmjg",
         "link":"https://artsandculture.google.com/entity/vincent-van-gogh/m07_m2?categoryId=artist"
      },
      {
         "artist":"Claude Monet",
         "works":"283",
         "thumbnail":"https://lh3.googleusercontent.com/hML980F4Qz51OPJqearbH_aPSxUPGFC7MaPEgYaxaG1CVdh8eh25aMtY67XkP6ZbnA",
         "link":"https://artsandculture.google.com/entity/claude-monet/m01xnj?categoryId=artist"
      },
      ...and other popular results
   ],
   "A":[
      {
         "artist":"A. B. Frost",
         "works":"5",
         "thumbnail":"https://lh3.ggpht.com/Lfwsu29qks8oAArsSnIrMYXCyAW1eJHSs_zRtV87_kuGOj31LZfabjT14QEg4g",
         "link":"https://artsandculture.google.com/entity/a-b-frost/m06b7cg?categoryId=artist"
      },
      {
         "artist":"A. J. Casson",
         "works":"13",
         "thumbnail":"https://lh3.googleusercontent.com/oPxgz35wxodv8998Nsarup0c78_gOey6FoR9BS2oHm303-g3F_I3yrjD9GooE8IQ5-k",
         "link":"https://artsandculture.google.com/entity/a-j-casson/m0695mj?categoryId=artist"
      }
      ...and other "A" results
   ],
   ...and other "A-Z" results
   "Far Past":[
      {
         "artist":"Titian",
         "works":"141",
         "thumbnail":"https://lh3.googleusercontent.com/uwSkz-wNQt6ts1mgb0GNgUpllFY_4I6Pa7W5XrAHkzVPiPZeekbS6KLEjvL2OJB41QA",
         "link":"https://artsandculture.google.com/entity/titian/m0144mv?categoryId=artist"
      },
      {
         "artist":"Sandro Botticelli",
         "works":"63",
         "thumbnail":"https://lh3.googleusercontent.com/M7LhlnWSe_9NS4IMW62GWpAJcxnEtBd2vjwDGuEVz3P0Qp-7IpASzVvXRFE-HBHL",
         "link":"https://artsandculture.google.com/entity/sandro-botticelli/m0jr3g?categoryId=artist"
      },
      ...and other "Far Past" results
   ],
   ...and other "Time" results
}