DEV Community 👩‍💻👨‍💻

Cover image for Web Scraping Google Finance Markets in Python
Dmitriy Zub ☀️
Dmitriy Zub ☀️

Posted on • Originally published at serpapi.com

Web Scraping Google Finance Markets in Python

What will be scraped

image

Full Code

import requests
import json
import re
import argparse
from parsel import Selector

# Command-line interface: one boolean flag per Google Finance "Markets" page.
# The ArgumentParser is named `arg_parser` so it does not share a name with the
# `parser()` scraping function defined further down in this script.
arg_parser = argparse.ArgumentParser(prog="Google Finance Markets Options")
arg_parser.add_argument('-i', '--indexes', action="store_true", help="scrape the market indexes page")
arg_parser.add_argument('-ma', '--most-active', action="store_true", help="scrape the most active page")
arg_parser.add_argument('-g', '--gainers', action="store_true", help="scrape the gainers page")
arg_parser.add_argument('-l', '--losers', action="store_true", help="scrape the losers page")
arg_parser.add_argument('-cl', '--climate-leaders', action="store_true", help="scrape the climate leaders page")
arg_parser.add_argument('-cc', '--crypto', action="store_true", help="scrape the cryptocurrencies page")
arg_parser.add_argument('-c', '--currency', action="store_true", help="scrape the currencies page")

# Parsed at module level because main() reads the global `args` namespace.
args = arg_parser.parse_args()

def main():
    """Fetch the Google Finance Markets page selected on the command line and parse it.

    Reads the module-level ``args`` namespace. Flags are checked in a fixed
    order and only the first one that is set is scraped.

    Returns:
        The dict built by ``parser()`` for the selected page, or ``None`` when
        no flag was passed.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }

    # (argparse attribute, URL slug) pairs, listed in the order the flags are checked.
    markets = (
        ("indexes", "indexes"),
        ("most_active", "most-active"),
        ("gainers", "gainers"),
        ("losers", "losers"),
        ("climate_leaders", "climate-leaders"),
        ("crypto", "cryptocurrencies"),
        ("currency", "currencies"),
    )

    for flag, slug in markets:
        if getattr(args, flag):
            response = requests.get(f"https://www.google.com/finance/markets/{slug}", headers=headers, timeout=30)
            # Surface blocked/error responses as an HTTP error instead of
            # letting parser() fail on markup that is not the expected page.
            response.raise_for_status()
            return parser(html=response)

    # No flag was given: nothing to scrape.
    return None


def _signed_percent(node):
    """Return the percent change found in *node* as a signed string, e.g. ``"+3.22%"``.

    The value is read from the ``aria-label`` attribute of the
    ``[jsname=Fe7oBc]`` element inside *node*. A label containing ``"Up"``
    yields a ``+`` sign, any other label a ``-`` sign.

    Returns ``None`` when the element is missing or its label holds no
    ``<digits>.<digits>%`` value.
    """
    label = node.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    if label is None:
        return None

    match = re.search(r"\d+\.\d+%", label)
    if match is None:
        return None

    sign = "+" if "Up" in label else "-"
    return f"{sign}{match.group()}"


def parser(html):
    """Extract news, the stocks table and the bottom suggestions from a Markets page.

    Args:
        html: response object whose ``.text`` attribute holds the page HTML.

    Returns:
        A dict with three lists, keyed ``<topic>_trends``,
        ``<topic>_discover_more`` and ``<topic>_news``, where ``<topic>`` is
        taken from the ``.Mrksgc`` heading of the page.
    """
    selector = Selector(text=html.text)
    # Key prefix: the heading text after "on ", with spaces turned into underscores.
    stock_topic = selector.css(".Mrksgc::text").get().split("on ")[1].replace(" ", "_")

    data = {
        f"{stock_topic}_trends": [],
        f"{stock_topic}_discover_more": [],
        f"{stock_topic}_news": []
    }

    # news results
    for index, news_results in enumerate(selector.css(".yY3Lee"), start=1):
        data[f"{stock_topic}_news"].append({
            "position": index,
            "title": news_results.css(".mRjSYb::text").get(),
            "source": news_results.css(".sfyJob::text").get(),
            "date": news_results.css(".Adak::text").get(),
            "image": news_results.css("img::attr(src)").get(),
        })

    # stocks table
    for index, stock_results in enumerate(selector.css("li a"), start=1):
        # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
        quote = stock_results.attrib["href"].replace("./quote/", "")

        data[f"{stock_topic}_trends"].append({
            "position": index,
            "title": stock_results.css(".ZvmM7::text").get(),
            "quote": stock_results.css(".COaKTb::text").get(),
            # "https://www.google.com/finance/MSFT:NASDAQ"
            "quote_link": f"https://www.google.com/finance/{quote}",
            "price_change": stock_results.css(".SEGxAb .P2Luy::text").get(),
            "percent_price_change": _signed_percent(stock_results)
        })

    # "you may be interested in" at the bottom of the page
    for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
        # Use this card's own link. Fall back to a nested <a> in case the
        # matched node is a wrapper rather than the anchor itself.
        href = interested_bottom.attrib.get("href") or interested_bottom.css("a::attr(href)").get(default="")
        # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
        quote = href.replace("./quote/", "")

        data[f"{stock_topic}_discover_more"].append({
            "position": index,
            "quote": interested_bottom.css(".COaKTb::text").get(),
            "quote_link": f"https://www.google.com/finance/{quote}",
            "title": interested_bottom.css(".RwFyvf::text").get(),
            "price": interested_bottom.css(".YMlKec::text").get(),
            "percent_price_change": _signed_percent(interested_bottom)
        })

    return data



if __name__ == "__main__":
    # Script entry point: scrape the selected page and pretty-print it as JSON.
    # ensure_ascii=False leaves non-ASCII characters unescaped in the output.
    scraped = main()
    print(json.dumps(scraped, indent=2, ensure_ascii=False))
Enter fullscreen mode Exit fullscreen mode

Prerequisites

Install libraries:

pip install requests parsel
Enter fullscreen mode Exit fullscreen mode

Basic knowledge scraping with CSS selectors

CSS selectors declare which part of the markup a style applies to thus allowing to extract data from matching tags and attributes.

If you haven't scraped with CSS selectors, there's a dedicated blog post of mine
about how to use CSS selectors when web-scraping that covers what it is, pros and cons, and why they matter from a web-scraping perspective.

Separate virtual environment

In short, it's a thing that creates an independent set of installed libraries including different Python versions that can coexist with each other in the same system thus preventing libraries or Python version conflicts.

If you didn't work with a virtual environment before, have a look at the
dedicated Python virtual environments tutorial using Virtualenv and Poetry blog post of mine to get a little bit more familiar.

📌 Note: this is not a strict requirement for this blog post.

Reduce the chance of being blocked

There's a chance that a request might be blocked. Have a look
at how to reduce the chance of being blocked while web-scraping, there are eleven methods to bypass blocks from most websites.

Code Explanation

Import libraries:

import requests
import json
import re
import argparse
from parsel import Selector
Enter fullscreen mode Exit fullscreen mode
Library Purpose
requests to make a request to the website.
json to convert extracted data to a JSON object.
re to extract parts of the data via regular expression.
argparse to parse command-line arguments.
parsel to parse data from HTML/XML documents. Similar to BeautifulSoup but supports XPath.

First, to choose which type of results to extract (for example crypto, gainers or losers) from the command line, without editing the code to enable specific functions, we can define command-line arguments with the built-in argparse library:

# Command-line interface: one boolean flag per Google Finance "Markets" page.
# The ArgumentParser is named `arg_parser` so it does not share a name with the
# `parser()` scraping function defined later in the script.
arg_parser = argparse.ArgumentParser(prog="Google Finance Markets Options")
arg_parser.add_argument('-i', '--indexes', action="store_true", help="scrape the market indexes page")
arg_parser.add_argument('-ma', '--most-active', action="store_true", help="scrape the most active page")
arg_parser.add_argument('-g', '--gainers', action="store_true", help="scrape the gainers page")
arg_parser.add_argument('-l', '--losers', action="store_true", help="scrape the losers page")
arg_parser.add_argument('-cl', '--climate-leaders', action="store_true", help="scrape the climate leaders page")
arg_parser.add_argument('-cc', '--crypto', action="store_true", help="scrape the cryptocurrencies page")
arg_parser.add_argument('-c', '--currency', action="store_true", help="scrape the currencies page")

# Parsed at module level because main() reads the global `args` namespace.
args = arg_parser.parse_args()
Enter fullscreen mode Exit fullscreen mode

Then we can run the script something like so:

$ python main.py -cc # will parse crypto results
Enter fullscreen mode Exit fullscreen mode

Note if action="store_true" is not used, the result will be an error:

$ python main.py -cc

Google Finance Markets Options: error: argument -cc/--crypto: expected one argument
Enter fullscreen mode Exit fullscreen mode

The action set to store_true will store the argument as True, if present. So if the argument is present, it will return some output.

We can also make parameters required, which means that a certain parameter must be passed whenever the script is run.

Code Explanation
add_argument Defines how a single command-line argument should be parsed.
parse_args determines what objects are created by add_argument and how they are assigned. Returns the populated namespace.

The next step is to create function with all the command-line logic. You can access command-line arguments with dot notation:

def main():
    """Fetch the page selected by a command-line flag and pass the response to parser().

    Abridged excerpt: only the ``--indexes`` branch is shown here.
    """

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }

    # `args` is the module-level namespace produced by parse_args().
    if args.indexes:
        # Despite its name, `html` is the requests response object; parser() reads its `.text`.
        html = requests.get("https://www.google.com/finance/markets/indexes", headers=headers, timeout=30)
        return parser(html=html)

    # ... other arguments logic
Enter fullscreen mode Exit fullscreen mode
Code Explanation
user-agent to act as a "real" user request from the browser by passing it to request headers. This is used to bypass blocks from Google as default requests user-agent is python-requests and websites understand that it's a bot that sends a request and might block it. Check what's your user-agent.
if args.indexes will check if certain command-line argument is being passed.
timeout=30 to tell requests to stop waiting for response after 30 seconds.
return parser(html=html) to return data from the parser() function to reduce the code size since every selector is identical to extract the data.

The next step is to make a parser function that will extract all the needed data from the page. The function requires an html argument that will be passed to parsel, then we need to define how the data will be structured once it's parsed:

def parser(html):
    """Load the response HTML into parsel and prepare the empty result structure.

    Abridged excerpt: the extraction loops and the final ``return`` are not shown here.
    """
    # `html` is a response object; its `.text` attribute holds the page HTML.
    selector = Selector(text=html.text)
    # Key prefix: the `.Mrksgc` heading text after "on ", with spaces replaced by underscores.
    stock_topic = selector.css(".Mrksgc::text").get().split("on ")[1].replace(" ", "_")

    # One list per section of the page, each keyed by the topic prefix.
    data = {
        f"{stock_topic}_trends": [],
        f"{stock_topic}_discover_more": [],
        f"{stock_topic}_news": []
    }
Enter fullscreen mode Exit fullscreen mode
Code Explanation
Selector(text=html.text) where passed HTML from the response will be processed by parsel.
text= is a parsel argument that accepts str object from where HTML nodes will be extracted.
css() to parse data from the passed CSS selector(s). Every CSS query translates to XPath using the cssselect package under the hood.
::text or ::attr(<attribute>) to extract textual or attribute data from the node.
get() to get actual data returned from parsel
split() to split a string into a list where each word is a list item
replace("<something>", "<with_something>") to replace something old with something new in a string.

After creating an empty dictionary structure, we need to fill it with news, stocks and other data by appending it, as a dict in this case:

# news results
for index, news_results in enumerate(selector.css(".yY3Lee"), start=1):
    data[f"{stock_topic}_news"].append({
        "position": index,
        "title": news_results.css(".mRjSYb::text").get(),
        "source": news_results.css(".sfyJob::text").get(),
        "date": news_results.css(".Adak::text").get(),
        "image": news_results.css("img::attr(src)").get(),
    })

# stocks table
for index, stock_results in enumerate(selector.css("li a"), start=1):
    # The aria-label holds both the direction ("Up ...") and the percentage.
    current_percent_change_raw_value = stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    current_percent_change = re.search(r"\d+\.\d+%", current_percent_change_raw_value).group()

    # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
    quote = stock_results.attrib["href"].replace("./quote/", "")

    data[f"{stock_topic}_trends"].append({
        "position": index,
        "title": stock_results.css(".ZvmM7::text").get(),
        "quote": stock_results.css(".COaKTb::text").get(),
        # "https://www.google.com/finance/MSFT:NASDAQ"
        "quote_link": f"https://www.google.com/finance/{quote}",
        "price_change": stock_results.css(".SEGxAb .P2Luy::text").get(),
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })

# "you may be interested in" at the bottom of the page
for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
    current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    current_percent_change = re.search(r"\d+\.\d+%", current_percent_change_raw_value).group()

    # Use this card's own link. Fall back to a nested <a> in case the
    # matched node is a wrapper rather than the anchor itself.
    href = interested_bottom.attrib.get("href") or interested_bottom.css("a::attr(href)").get(default="")
    # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
    quote = href.replace("./quote/", "")

    data[f"{stock_topic}_discover_more"].append({
        "position": index,
        "quote": interested_bottom.css(".COaKTb::text").get(),
        "quote_link": f"https://www.google.com/finance/{quote}",
        "title": interested_bottom.css(".RwFyvf::text").get(),
        "price": interested_bottom.css(".YMlKec::text").get(),
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })
Enter fullscreen mode Exit fullscreen mode
Code Explanation
enumerate() to add a counter to an iterable and return it. start=1 will start counting from 1, instead from the default value of 0.
::text or ::attr(<attribute>) to extract textual or attribute data from the node.
data[f"{stock_topic}_news"] dynamically appends data as a dict to whatever value is being extracted by a stock_topic variable.
append() to append extracted data to the list as dictionary.
css() to parse data from the passed CSS selector(s). Every CSS query translates to XPath using the cssselect package under the hood.
get() to get actual data.
getall() to get a list of all matches.
[jsname=Fe7oBc] is a CSS selector that is used to select elements with the specified attribute and value e.g. [attribute=value].
attrib["href"] is a parsel way of accessing a node attribute. attrib returns a dict of attributes for the first matched element, or an empty dict if nothing matched.
replace("<something>", "<with_something>") to replace something old with something new in a string.
re.search() to match parts of the string and grab only digit values. group() to return matched string by a regular expression.

Return the data:

# data = {
#     f"{stock_topic}_trends": [],
#     f"{stock_topic}_discover_more": [],
#     f"{stock_topic}_news": []
# }

# extraction code...

return data
Enter fullscreen mode Exit fullscreen mode

And finally, we need to specify that this is a runnable script for code readers:

if __name__ == "__main__":
    # Script entry point: scrape the selected page and pretty-print it as JSON.
    # ensure_ascii=False leaves non-ASCII characters unescaped in the output.
    scraped = main()
    print(json.dumps(scraped, indent=2, ensure_ascii=False))
Enter fullscreen mode Exit fullscreen mode

Now you can run your script from the command-line:

$ python main.py -ma # most-active
Enter fullscreen mode Exit fullscreen mode

You can also access help command -h to see available arguments like so:

$ python main.py -h
usage: Google Finance Markets Options [-h] [-i] [-ma] [-g] [-l]
                                      [-cl] [-cc] [-c]

optional arguments:
  -h, --help            show this help message and exit
  -i, --indexes
  -ma, --most-active
  -g, --gainers
  -l, --losers
  -cl, --climate-leaders
  -cc, --crypto
  -c, --currency
Enter fullscreen mode Exit fullscreen mode

Full output:

{
  "most_active_trends": [
    {
      "position": 1,
      "title": "Advanced Micro Devices, Inc.",
      "quote": "AMD",
      "quote_link": "https://www.google.com/finance/AMD:NASDAQ",
      "price_change": "+$3.04",
      "percent_price_change": "+3.22%"
    }, ... other results
    {
      "position": 50,
      "title": "Freeport-McMoRan Inc",
      "quote": "FCX",
      "quote_link": "https://www.google.com/finance/FCX:NYSE",
      "price_change": "-$1.15",
      "percent_price_change": "-3.66%"
    }
  ],
  "most_active_discover_more": [
    {
      "position": 1,
      "quote": "Index",
      "quote_link": "https://www.google.com/financeFCX:NYSE",
      "title": "Dow Jones Industrial Average",
      "price": "32,772.36",
      "percent_price_change": "-0.22%"
    }, ... other results
    {
      "position": 18,
      "quote": "NFLX",
      "quote_link": "https://www.google.com/financeFCX:NYSE",
      "title": "Netflix Inc",
      "price": "$226.14",
      "percent_price_change": "+0.55%"
    }
  ],
  "most_active_news": [
    {
      "position": 1,
      "title": "Alibaba says will work to keep trading in U.S., Hong Kong after being added \nto SEC delisting risk list",
      "source": "CNBC",
      "date": "7 hours ago",
      "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRMBjVDpAgK8AJP6gxfd89Kb5rz7th_s3ntTLA_WYWnVWT3Q05aQJTWpMpjcOg"
    }, ... other news results
    {
      "position": 6,
      "title": "Intel CEO: 'This is a time for a bit of austerity'",
      "source": "Yahoo Finance",
      "date": "4 hours ago",
      "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTxkwNmmHmcXqkF3-pa2Bl0SsCzdIJyB0jPdutL0vw9pV4sRkgy8BKemYIkEeg"
    }
  ]
}
Enter fullscreen mode Exit fullscreen mode

Originally published at SerpApi: https://serpapi.com/blog/scrape-google-finance-markets-in-python/

Join us on Twitter | YouTube

Add a Feature Request💫 or a Bug🐞

Top comments (0)

🌚 Browsing with dark mode makes you a better developer by a factor of exactly 40.

It's a scientific fact.