DEV Community 👩‍💻👨‍💻

Cover image for Web Scraping Google Finance Main Page with Python
Dmitriy Zub ☀️ for SerpApi

Posted on • Originally published at serpapi.com

Web Scraping Google Finance Main Page with Python

What will be scraped

image

Full Code

import requests, json, re
from parsel import Selector


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None.

    None is also returned when *text* itself is missing, so a selector that
    found nothing on the page does not raise.
    """
    if not text:
        return None
    match = re.search(pattern, text)
    return match.group(1) if match else None


def _signed_percent(aria_label, with_percent_sign=True):
    """Build a signed percent string from a price-change aria-label.

    The label is expected to contain "Up" for a rise and a number followed by
    "%" (the patterns this replaces assumed a shape like "Up by 0.59%").
    Anything that does not contain "Up" is given a "-" sign.

    Returns None when the label is missing or holds no percent value.
    """
    # Accepts integer and decimal percents of any length ("5%", "12%", "0.59%").
    number = _first_group(r"(\d+(?:\.\d+)?)%", aria_label)
    if number is None:
        return None
    sign = "+" if "Up" in aria_label else "-"
    suffix = "%" if with_percent_sign else ""
    return f"{sign}{number}{suffix}"


def scrape_google_finance_main_page():
    """Scrape the Google Finance main page and return its sections as a dict.

    Sends one GET request to https://www.google.com/finance/ with a desktop
    browser User-Agent (30 second timeout) and extracts data with parsel CSS
    selectors.

    Returns:
        dict with the keys "market_trends" (list of str), "interested_in"
        (dict with the lists "top_position" and "bottom_position"),
        "earning_calendar", "most_followed_on_google" and "news" (lists of
        dicts). Links, tickers, follower counts and percent changes that
        cannot be extracted are None instead of raising. Any list may be
        empty when its selector matches nothing.
    """
    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }

    response = requests.get("https://www.google.com/finance/", headers=headers, timeout=30)
    selector = Selector(text=response.text)

    # where all extracted data will be temporarily located
    ticker_data = {
        "market_trends": [],
        "interested_in": {
            "top_position": [],
            "bottom_position": []
        },
        "earning_calendar": [],
        "most_followed_on_google": [],
        "news": [],
    }

    # Market trends top results
    ticker_data["market_trends"] = selector.css(".gR2U6::text").getall()

    # Earnings calendar results
    for calendar_quote in selector.css(".d3fRjc"):
        quote_href = calendar_quote.css(".yaubCc::attr(href)").get()
        quote_link = None
        if quote_href:
            # relative "./quote/..." href -> absolute URL
            quote_link = "https://www.google.com/finance/quote" + quote_href.replace("./quote/", "/")

        ticker_data["earning_calendar"].append({
            "quote": calendar_quote.css(".yaubCc::text").get(),
            "quote_link": quote_link,
            "short_date": calendar_quote.css(".JiAI5b").xpath("normalize-space()").get(),
            "full_date": calendar_quote.css(".fVovwd::text").get()
        })

    # Most followed on Google results
    for google_most_followed in selector.css(".NaLFgc"):
        change_label = google_most_followed.css("[jsname=Fe7oBc]::attr(aria-label)").get()

        ticker_data["most_followed_on_google"].append({
            "title": google_most_followed.css(".TwnKPb::text").get(),
            # ticker is the part between "./quote/" and ":" in the href
            "quote": _first_group(r"\./quote/([^:/]+):", google_most_followed.attrib.get("href")),
            # number in front of the "M" suffix of the followers text
            "following": _first_group(r"(\d+(?:\.\d+)?)M", google_most_followed.css(".Iap8Fc::text").get()),
            # this section reports the change without a trailing "%"
            "percent_price_change": _signed_percent(change_label, with_percent_sign=False)
        })

    # news results. If empty -> run once again. For some reason it could return [].
    for index, news in enumerate(selector.css(".yY3Lee"), start=1):
        ticker_data["news"].append({
            "position": index,
            "title": news.css(".Yfwt5::text").get(),
            "link": news.css(".z4rs2b a::attr(href)").get(),
            "source": news.css(".sfyJob::text").get(),
            "published": news.css(".Adak::text").get(),
            "thumbnail": news.css("img.Z4idke::attr(src)").get()
        })

    # "you may be interested in" at the top of the page results
    for index, interested_top in enumerate(selector.css(".sbnBtf:not(.xJvDsc) .SxcTic"), start=1):
        change_label = interested_top.css("[jsname=Fe7oBc]::attr(aria-label)").get()

        ticker_data["interested_in"]["top_position"].append({
            "index": index,
            "title": interested_top.css(".ZvmM7::text").get(),
            "quote": interested_top.css(".COaKTb::text").get(),
            "price_change": interested_top.css(".SEGxAb .P2Luy::text").get(),
            "percent_price_change": _signed_percent(change_label)
        })

    # "you may be interested in" at the bottom of the page results
    for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
        change_label = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()

        ticker_href = interested_bottom.attrib.get("href")
        ticker_link = None
        if ticker_href:
            # relative "./..." href -> absolute URL
            ticker_link = "https://www.google.com/finance" + ticker_href.replace("./", "/")

        ticker_data["interested_in"]["bottom_position"].append({
            "position": index,
            "ticker": interested_bottom.css(".COaKTb::text").get(),
            "ticker_link": ticker_link,
            "title": interested_bottom.css(".RwFyvf::text").get(),
            "price": interested_bottom.css(".YMlKec::text").get(),
            "percent_price_change": _signed_percent(change_label)
        })

    return ticker_data


# Run the scraper once, then pretty-print the result as JSON
# (ensure_ascii=False keeps non-ASCII characters readable).
scraped_data = scrape_google_finance_main_page()
print(json.dumps(scraped_data, indent=2, ensure_ascii=False))
Enter fullscreen mode Exit fullscreen mode

Prerequisites

Install libraries:

pip install requests parsel
Enter fullscreen mode Exit fullscreen mode

Basic knowledge scraping with CSS selectors

CSS selectors declare which part of the markup a style applies to, which makes it possible to extract data from matching tags and attributes.

If you haven't scraped with CSS selectors, there's a dedicated blog post of mine
about how to use CSS selectors when web-scraping that covers what it is, pros and cons, and why they matter from a web-scraping perspective.

Separate virtual environment

In short, it's a thing that creates an independent set of installed libraries including different Python versions that can coexist with each other in the same system thus preventing libraries or Python version conflicts.

If you didn't work with a virtual environment before, have a look at the
dedicated Python virtual environments tutorial using Virtualenv and Poetry blog post of mine to get a little bit more familiar.

📌Note: this is not a strict requirement for this blog.

Reduce the chance of being blocked

There's a chance that a request might be blocked. Have a look
at how to reduce the chance of being blocked while web-scraping; it covers eleven methods to bypass blocks from most websites.

Code Explanation

Import libraries:

import requests, json, re
from parsel import Selector
Enter fullscreen mode Exit fullscreen mode
Library Purpose
requests to make a request to the website.
json to convert extracted data to a JSON object.
re to extract parts of the data via regular expression.
parsel to parse data from HTML/XML documents. Similar to BeautifulSoup.

Create request headers:

# Browser-like User-Agent that is sent along with the request.
# Custom headers in requests: https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
# Find your own User-Agent: https://www.whatismybrowser.com/detect/what-is-my-user-agent
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/100.0.4896.60 Safari/537.36"
)
headers = {"User-Agent": user_agent}
Enter fullscreen mode Exit fullscreen mode
Header Purpose
user-agent to act as a "real" user request from the browser by passing it to request headers. Check what's your user-agent.

Pass requests headers to requests.get() while making a request, and pass response to parsel:

# Fetch the Google Finance main page (give up after 30 seconds) and hand the
# response body to parsel for CSS/XPath queries.
finance_url = "https://www.google.com/finance/"
html = requests.get(finance_url, timeout=30, headers=headers)
selector = Selector(text=html.text)
Enter fullscreen mode Exit fullscreen mode
Code Explanation
timeout=30 to stop waiting for response after 30 seconds.
Selector(text=html.text) where passed HTML from the response will be processed by parsel.
text= is a parsel argument that accepts str object from where HTML nodes will be extracted.

Create an empty dictionary structure where all the data will be filled in later:

# Empty container for every page section; the extraction code fills it in.
ticker_data = dict(
    market_trends=[],
    interested_in=dict(top_position=[], bottom_position=[]),
    earning_calendar=[],
    most_followed_on_google=[],
    news=[],
)
Enter fullscreen mode Exit fullscreen mode

Extracting data:

# Market trends results
ticker_data["market_trends"] = selector.css(".gR2U6::text").getall()

# Earnings calendar results
for calendar_quote in selector.css(".d3fRjc"):
    ticker_data["earning_calendar"].append({
        "quote": calendar_quote.css(".yaubCc::text").get(),
        "quote_link": f'https://www.google.com/finance/quote{calendar_quote.css(".yaubCc::attr(href)").get().replace("./quote/", "/")}',
        "short_date": calendar_quote.css(".JiAI5b").xpath("normalize-space()").get(),
        "full_date": calendar_quote.css(".fVovwd::text").get()
    })

# Most followed on Google results
for google_most_followed in selector.css(".NaLFgc"):
    # the aria-label holds both the direction ("Up" or not) and the percent value
    current_percent_change_raw_value = google_most_followed.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    # integer or decimal percent after "by"; the "%" itself is not captured
    current_percent_change = re.search(r"by\s?(\d+(?:\.\d+)?)%", current_percent_change_raw_value).group(1)

    ticker_data["most_followed_on_google"].append({
        "title": google_most_followed.css(".TwnKPb::text").get(),
        "quote": re.search(r"\.\/quote\/(\w+):",google_most_followed.attrib["href"]).group(1),            # https://regex101.com/r/J3DDIX/1
        "following": re.search(r"(\d+\.\d+)M", google_most_followed.css(".Iap8Fc::text").get()).group(1), # https://regex101.com/r/7ptVha/1
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })

# news results. If empty -> run once again. For some reason it could return [].
for index, news in enumerate(selector.css(".yY3Lee"), start=1):
    ticker_data["news"].append({
        "position": index,
        "title": news.css(".Yfwt5::text").get(),
        "link": news.css(".z4rs2b a::attr(href)").get(),
        "source": news.css(".sfyJob::text").get(),
        "published": news.css(".Adak::text").get(),
        "thumbnail": news.css("img.Z4idke::attr(src)").get()
    })

# "you may be interested in" at the top of the page results
for index, interested_top in enumerate(selector.css(".sbnBtf:not(.xJvDsc) .SxcTic"), start=1):
    current_percent_change_raw_value = interested_top.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    # whole percent value including "%", e.g. "5%", "12%" or "0.59%"
    current_percent_change = re.search(r"\d+(?:\.\d+)?%", current_percent_change_raw_value).group()

    ticker_data["interested_in"]["top_position"].append({
        "index": index,
        "title": interested_top.css(".ZvmM7::text").get(),
        "quote": interested_top.css(".COaKTb::text").get(),
        "price_change": interested_top.css(".SEGxAb .P2Luy::text").get(),
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })

# "you may be interested in" at the bottom of the page results
for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
    # the percent change is read the same way as for the top block:
    # both use the same [jsname=Fe7oBc] aria-label
    current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    current_percent_change = re.search(r"\d+(?:\.\d+)?%", current_percent_change_raw_value).group()

    ticker_data["interested_in"]["bottom_position"].append({
        "position": index,
        "ticker": interested_bottom.css(".COaKTb::text").get(),
        "ticker_link": f'https://www.google.com/finance{interested_bottom.attrib["href"].replace("./", "/")}',
        "title": interested_bottom.css(".RwFyvf::text").get(),
        "price": interested_bottom.css(".YMlKec::text").get(),
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })
Enter fullscreen mode Exit fullscreen mode
Code Explanation
ticker_data["market_trends"] accesses the "market_trends" key of the ticker_data dictionary and assigns it whatever value parsel extracts.
css() to parse data from the passed CSS selector(s). Every CSS query is translated to XPath by the cssselect package under the hood.
[jsname=Fe7oBc] is a CSS selector that is used to select elements with the specified attribute and value, e.g. [attribute=value].
append() to append extracted data to the list as dictionary.
getall() to get a list of all matches.
get() to get actual data.
::text or ::attr(<attribute>) to extract textual or attribute data from the node.
xpath("normalize-space()") to parse blank text nodes as well. By default, blank text nodes are skipped by XPath.
replace("<something>", "<with_something>") to replace something old with something new in a string.
enumerate() to add a counter to an iterable and return it. start=1 will start counting from 1 instead of the default value of 0.
re.search() to match parts of the string and grab only digit values. group() to return matched string by a regular expression.

Return and print the data:

# Recap of the dictionary created at the start of scrape_google_finance_main_page():
# ticker_data = {
#     "market_trends": [],
#     "interested_in": {
#         "top_position": [],
#         "bottom_position": []
#     },
#     "earning_calendar": [],
#     "most_followed_on_google": [],
#     "news": [],
# }

# extraction code...

# Last statement inside the function: hand the filled dictionary back to the caller.
return ticker_data

# Outside the function: call it and print the result as JSON indented by 2 spaces;
# ensure_ascii=False outputs non-ASCII characters as-is instead of \u escapes.
print(json.dumps(scrape_google_finance_main_page(), indent=2, ensure_ascii=False))
Enter fullscreen mode Exit fullscreen mode

Full output:

{
  "market_trends": {
    "top_position": [
      "Market indexes",
      "Most active",
      "Gainers",
      "Losers",
      "Climate leaders",
      "Crypto",
      "Currencies"
    ],
    "bottom_position": [
      {
        "index": 1,
        "title": "Tesla Inc",
        "quote": "TSLA",
        "price": "$824.46",
        "price_percent_change": "+0.59%"
      }, ... other results
      {
        "index": 6,
        "title": "BEL 20",
        "quote": "Index",
        "price": "3,774.05",
        "price_percent_change": "+1.15%"
      }
    ]
  },
  "interested_in": {
    "top_position": [
      {
        "index": 1,
        "title": "Tesla Inc",
        "quote": "TSLA",
        "price_change": "+$47.88",
        "percent_price_change": "+6.17%"
      }, ... other results
      {
        "index": 6,
        "title": "BEL 20",
        "quote": "Index",
        "price_change": "+22.01",
        "percent_price_change": "+0.59%"
      }
    ],
    "bottom_position": [
      {
        "position": 1,
        "ticker": "Index",
        "ticker_link": "https://www.google.com/finance/quote/BEL20:INDEXEURO",
        "title": "BEL 20",
        "price": "3,774.05",
        "percent_price_change": "+0.59%"
      }, ... other results
      {
        "position": 18,
        "ticker": "PFE",
        "ticker_link": "https://www.google.com/finance/quote/PFE:NYSE",
        "title": "Pfizer Inc.",
        "price": "$51.95",
        "percent_price_change": "-0.67%"
      }
    ]
  },
  "earning_calendar": [
    {
      "quote": "Apple",
      "quote_link": "https://www.google.com/finance/quote/AAPL:NASDAQ",
      "short_date": "Jul28",
      "full_date": "Jul 28, 2022, 11:00 PM"
    }, ... other results
    {
      "quote": "Occidental Petroleum",
      "quote_link": "https://www.google.com/finance/quote/OXY:NYSE",
      "short_date": "Aug2",
      "full_date": "Aug 2, 2022, 10:00 PM"
    }
  ],
  "most_followed_on_google": [
    {
      "title": "Apple Inc",
      "quote": "AAPL",
      "following": "3.71",
      "percent_price_change": "+3.42"
    }, ... other results
    {
      "title": "Tesla Inc",
      "quote": "TSLA",
      "following": "1.49",
      "percent_price_change": "+6.17"
    }
  ],
  "news": [
    {
      "position": 1,
      "title": "This kind of shock to the economy will have consequences",
      "link": "https://www.cnn.com/2022/07/27/politics/fed-interest-rate-volcker-what-matters/index.html",
      "source": "CNN",
      "published": "10 hours ago",
      "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRcLNm7uU5YfuvveVMWNvlQGUMcCPi4-7QJAqfKcDJgq7A3n1E_wiy53--_FFA"
    }, ... other news
    {
      "position": 9,
      "title": "The 20 Best Netflix Shows of All Time -- Ranked",
      "link": "https://www.rollingstone.com/tv-movies/tv-movie-lists/best-netflix-shows-1386323/",
      "source": "Rolling Stone",
      "published": "20 hours ago",
      "thumbnail": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSsaABpxAxYW29MTnyeSHb1Z9ex1bMvXQQnFB5RJqz9LogWOR9zyOKw9YrjClI"
    }
  ]
}
Enter fullscreen mode Exit fullscreen mode

Join us on Twitter | YouTube

Add a Feature Request💫 or a Bug🐞

Top comments (0)

🌚 Life is too short to browse without dark mode