DEV Community

Cover image for Scrape Google Play Apps with Python
Artur Chukhrai for SerpApi

Posted on • Updated on

Scrape Google Play Apps with Python

What will be scraped

wwbs-google-play-apps

📌Note: Google Play gives different results for logged in and not logged in users.

Full Code

If you don't need explanation, have a look at full code example in the online IDE.

import time, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from parsel import Selector

google_play_apps = {
    'Top charts': {
        'Top free': [],
        'Top grossing': [],
        'Top paid': []
    },
}


def scroll_page(url):
    """Open *url* in headless Chrome, load all of its data and return the parsed page.

    The page is scrolled to the bottom over and over until the ``.snByac``
    button exists and can be clicked. The three top charts are then scraped
    through ``scrape_top_charts`` while the browser is still alive, and the
    final page source is wrapped in a parsel ``Selector``.

    :param url: address of the Google Play page to load.
    :return: ``Selector`` built from the page source after scrolling.
    """
    # Raised by execute_script() when the injected JavaScript throws, e.g.
    # because querySelector() returned null (button not in the DOM yet).
    from selenium.common.exceptions import JavascriptException

    service = Service(ChromeDriverManager().install())

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--lang=en")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=service, options=options)

    # try/finally guarantees the browser process is shut down even when
    # loading or scraping raises.
    try:
        driver.get(url)

        # NOTE: this loop only ends once the button could be clicked.
        while True:
            try:
                driver.execute_script("document.querySelector('.snByac').click();")
                # WebDriverWait takes its timeout in seconds: wait up to 10 s.
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))
                break
            except JavascriptException:
                # Button not present yet: scroll down so more data is loaded.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))

        # Raw strings: the backslash escapes the "|" of the element id for the
        # CSS engine and must reach Selenium untouched.
        scrape_top_charts(driver=driver, chart='Top free', button_selector=r'#ct\|apps_topselling_free .ypTNYd')
        scrape_top_charts(driver=driver, chart='Top grossing', button_selector=r'#ct\|apps_topgrossing .ypTNYd')
        scrape_top_charts(driver=driver, chart='Top paid', button_selector=r'#ct\|apps_topselling_paid .ypTNYd')

        selector = Selector(driver.page_source)
    finally:
        driver.quit()

    return selector


def scrape_top_charts(driver, chart, button_selector):
    """Switch to one top chart and store its apps in ``google_play_apps``.

    The button matched by *button_selector* is clicked through JavaScript,
    the page source is re-parsed after a two second pause, and one dict per
    ``.itIJzb`` element is appended to ``google_play_apps['Top charts'][chart]``.

    :param driver: Selenium WebDriver holding the loaded page.
    :param chart: key inside ``google_play_apps['Top charts']`` to append to.
    :param button_selector: CSS selector of the button that shows the chart.
    """
    button = driver.find_element(By.CSS_SELECTOR, button_selector)
    driver.execute_script("arguments[0].click();", button)
    time.sleep(2)  # fixed pause before the page source is read again
    selector = Selector(driver.page_source)

    for result in selector.css('.itIJzb'):
        # .get() returns None when nothing matches, so every value that is
        # post-processed is guarded instead of crashing the whole scrape.
        title = result.css('.OnEJge::text').get()
        link = result.css('::attr(href)').get()
        link = ('https://play.google.com' + link) if link else link
        category = result.css('.ubGTjb .sT93pb.w2kbF:not(.K4Wkre)::text').get()
        rating = result.css('.CKzsaf .w2kbF::text').get()
        rating = float(rating) if rating else rating
        thumbnail = result.css('.stzEZd::attr(srcset)').get()
        thumbnail = thumbnail.replace(' 2x', '') if thumbnail else thumbnail

        google_play_apps['Top charts'][chart].append({
            'title': title,
            'link': link,
            'category': category,
            'rating': rating,
            'thumbnail': thumbnail,
        })


def scrape_all_sections(selector):
    """Collect the apps of every ``section`` element and print all results as JSON.

    For each section a list is stored in ``google_play_apps`` under the
    section title, holding one dict per ``.UVEnyf`` element. After all
    sections are processed the whole ``google_play_apps`` dictionary is
    printed.

    :param selector: parsel ``Selector`` of the page.
    """
    for section in selector.css('section'):
        section_title = section.css('.kcen6d span::text').get()
        google_play_apps[section_title] = []

        for app in section.css('.UVEnyf'):
            # .get() returns None when nothing matches; guard every value
            # that is post-processed, the same way the rating is guarded.
            title = app.css('.Epkrse::text').get()
            link = app.css('.Si6A0c::attr(href)').get()
            link = ('https://play.google.com' + link) if link else link
            rating = app.css('.LrNMN::text').get()
            rating = float(rating) if rating else rating
            thumbnail = app.css('.Q8CSx::attr(srcset)').get()
            thumbnail = thumbnail.replace(' 2x', '') if thumbnail else thumbnail

            google_play_apps[section_title].append({
                'title': title,
                'link': link,
                'rating': rating,
                'thumbnail': thumbnail,
            })

    print(json.dumps(google_play_apps, indent=2, ensure_ascii=False))


def scrape_google_play_apps():
    """Build the Google Play apps URL, load the page and print the scraped data.

    The URL is passed to ``scroll_page`` and the returned selector is handed
    to ``scrape_all_sections``, which prints the results.
    """
    # Imported locally so the function is self-contained.
    from urllib.parse import urlencode

    params = {
        'device': 'phone',
        'hl': 'en_GB',      # language
        'gl': 'US',         # country of the search
    }

    # urlencode() serialises (and escapes) every entry of params, so adding a
    # key above is enough to send it with the request.
    URL = f"https://play.google.com/store/apps?{urlencode(params)}"

    result = scroll_page(URL)
    scrape_all_sections(result)


if __name__ == "__main__":
    scrape_google_play_apps()
Enter fullscreen mode Exit fullscreen mode

Preparation

Install libraries:

pip install parsel selenium webdriver webdriver_manager
Enter fullscreen mode Exit fullscreen mode

Reduce the chance of being blocked

Make sure you're passing a user-agent in the request headers to act as a "real" user visit. The default requests user-agent is python-requests (and headless Chrome, which this post uses, announces itself as HeadlessChrome), so websites understand that it's most likely a script that sends the request. Check what's your user-agent.

There's a how to reduce the chance of being blocked while web scraping blog post that can get you familiar with basic and more advanced approaches.

Code Explanation

Import libraries:

import time, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from parsel import Selector
Enter fullscreen mode Exit fullscreen mode
Library Purpose
time to work with time in Python.
json to convert extracted data to a JSON object.
webdriver to drive a browser natively, as a user would, either locally or on a remote machine using the Selenium server.
Service to manage the starting and stopping of the ChromeDriver.
By provides the set of supported locator strategies (By.ID, By.TAG_NAME, By.XPATH etc).
WebDriverWait to wait only as long as required.
expected_conditions contains a set of predefined conditions to use with WebDriverWait.
Selector XML/HTML parser that has full XPath and CSS selectors support.

Define dictionary structure:

google_play_apps = {
    'Top charts': {
        'Top free': [],
        'Top grossing': [],
        'Top paid': []
    },
}
Enter fullscreen mode Exit fullscreen mode

Top-level code environment

At the beginning of the function, parameters are defined for generating the URL. If you want to pass other parameters to the URL, you can do so using the params dictionary. The parameters affect the output results:

params = {
    'device': 'phone',  
    'hl': 'en_GB',      # language 
    'gl': 'US',         # country of the search
}
Enter fullscreen mode Exit fullscreen mode

Next, the URL is passed to the scroll_page(URL) function to scroll the page and get all data. The result that this function returns is passed to the scrape_all_sections(result) function to extract the necessary data. The explanation of these functions will be in the corresponding headings below.

This code uses the generally accepted rule of using the __name__ == "__main__" construct:

def scrape_google_play_apps():
    """Build the Google Play apps URL, load the page and print the scraped data.

    The URL is passed to ``scroll_page`` and the returned selector is handed
    to ``scrape_all_sections``, which prints the results.
    """
    # Imported locally so the function is self-contained.
    from urllib.parse import urlencode

    params = {
        'device': 'phone',
        'hl': 'en_GB',      # language
        'gl': 'US',         # country of the search
    }

    # urlencode() serialises (and escapes) every entry of params, so adding a
    # key above is enough to send it with the request.
    URL = f"https://play.google.com/store/apps?{urlencode(params)}"

    result = scroll_page(URL)
    scrape_all_sections(result)


if __name__ == "__main__":
    scrape_google_play_apps()
Enter fullscreen mode Exit fullscreen mode

The code under this check runs only if the user has run this file directly. If the user imports this file into another one, the condition is false and scrape_google_play_apps() is not called automatically.

You can watch the video Python Tutorial: if __name__ == '__main__' for more details.

Scroll page

The function takes the URL and returns a full HTML structure.

First, let's understand how pagination works on the Google Play Apps page. Data does not load immediately. If the user needs more data, they simply scroll the page and the site downloads a small batch of data.

Accordingly, to get all the data, you need to scroll to the end of the page. But we will face the problem that on the last scroll the SHOW MORE button appears. By clicking on it, you will get the last piece of data. A page scroll demo is shown below:

google-play-apps-scroll

In this case, the selenium library is used, which allows you to simulate user actions in the browser. For selenium to work, you need to use ChromeDriver, which can be downloaded manually or using code. In our case, the second method is used. To control the start and stop of ChromeDriver, you need to use Service, while ChromeDriverManager().install() downloads the matching driver binary under the hood:

service = Service(ChromeDriverManager().install())
Enter fullscreen mode Exit fullscreen mode

You should also add options to work correctly:

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--lang=en')
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36')
options.add_argument('--no-sandbox')
Enter fullscreen mode Exit fullscreen mode
Chrome options Explanation
--headless to run Chrome in headless mode.
--lang=en to set the browser language to English.
user-agent to act as a "real" user request from the browser by passing it to request headers. Check what's your user-agent.
--no-sandbox to make chromedriver work properly on different machines.

Now we can start webdriver and pass the url to the get() method.

driver = webdriver.Chrome(service=service, options=options)
driver.get(url)
Enter fullscreen mode Exit fullscreen mode

How does the page scrolling algorithm work? In each iteration of the loop, the program looks for the button selector. If the button is present, then the program is clicking on it and the data is loaded, after which the loop ends. Otherwise, the page scrolls down and the data loads.

Scrolling a page and clicking on a button is done by pasting the JavaScript code into the execute_script() method.

# Raised by execute_script() when the injected JavaScript throws, e.g.
# because querySelector() returned null (button not in the DOM yet).
from selenium.common.exceptions import JavascriptException

while True:
    try:
        driver.execute_script("document.querySelector('.snByac').click();")
        # WebDriverWait takes its timeout in seconds: wait up to 10 s.
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))
        break
    except JavascriptException:
        # Button not present yet: scroll down so more data is loaded.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))
Enter fullscreen mode Exit fullscreen mode

Sometimes it is difficult to calculate how long it will take to load a page, it all depends on the speed of the Internet, the power of the computer and other factors. The method described below is much better than using a delay in seconds since the wait occurs exactly until the moment when the page is fully loaded:

# The second argument is the timeout in seconds.
WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))
Enter fullscreen mode Exit fullscreen mode

📌Note: The second argument of WebDriverWait is the timeout in seconds. The intent here is to give the page up to 10 seconds to load; if it loads earlier, the wait ends immediately.

After all the data has been loaded, you need to pass them to the scrape_top_charts function. This function will be described in the relevant section below. It is important to extract the data before we stop the driver.

# Raw strings: the backslash escapes the "|" of the element id for the CSS
# engine and must reach Selenium untouched.
scrape_top_charts(driver=driver, chart='Top free', button_selector=r'#ct\|apps_topselling_free .ypTNYd')
scrape_top_charts(driver=driver, chart='Top grossing', button_selector=r'#ct\|apps_topgrossing .ypTNYd')
scrape_top_charts(driver=driver, chart='Top paid', button_selector=r'#ct\|apps_topselling_paid .ypTNYd')
Enter fullscreen mode Exit fullscreen mode

Now we need to process the HTML using Selector from the Parsel package, to which we pass the HTML structure with all the data that was received after scrolling the page. This is necessary to successfully retrieve data in the next function. After all the operations are done, stop the driver:

selector = Selector(driver.page_source)
driver.quit()
Enter fullscreen mode Exit fullscreen mode

The function looks like this:

def scroll_page(url):
    """Open *url* in headless Chrome, load all of its data and return the parsed page.

    The page is scrolled to the bottom over and over until the ``.snByac``
    button exists and can be clicked. The three top charts are then scraped
    through ``scrape_top_charts`` while the browser is still alive, and the
    final page source is wrapped in a parsel ``Selector``.

    :param url: address of the Google Play page to load.
    :return: ``Selector`` built from the page source after scrolling.
    """
    # Raised by execute_script() when the injected JavaScript throws, e.g.
    # because querySelector() returned null (button not in the DOM yet).
    from selenium.common.exceptions import JavascriptException

    service = Service(ChromeDriverManager().install())

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--lang=en")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=service, options=options)

    # try/finally guarantees the browser process is shut down even when
    # loading or scraping raises.
    try:
        driver.get(url)

        # NOTE: this loop only ends once the button could be clicked.
        while True:
            try:
                driver.execute_script("document.querySelector('.snByac').click();")
                # WebDriverWait takes its timeout in seconds: wait up to 10 s.
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))
                break
            except JavascriptException:
                # Button not present yet: scroll down so more data is loaded.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'body')))

        # Raw strings: the backslash escapes the "|" of the element id for the
        # CSS engine and must reach Selenium untouched.
        scrape_top_charts(driver=driver, chart='Top free', button_selector=r'#ct\|apps_topselling_free .ypTNYd')
        scrape_top_charts(driver=driver, chart='Top grossing', button_selector=r'#ct\|apps_topgrossing .ypTNYd')
        scrape_top_charts(driver=driver, chart='Top paid', button_selector=r'#ct\|apps_topselling_paid .ypTNYd')

        selector = Selector(driver.page_source)
    finally:
        driver.quit()

    return selector
Enter fullscreen mode Exit fullscreen mode

Scrape top charts

The function takes 3 parameters:

  • driver - the Selenium WebDriver instance that controls the browser with the loaded page.
  • chart - the chart from which the data will be retrieved.
  • button_selector - the selector of the button to go to the corresponding chart.

On the GIF, I show how the top charts work:

google-play-apps-top-charts

The corresponding data is loaded by pressing the corresponding buttons. The process of clicking the button and creating the object to retrieve data is the same as we did before in the scroll_page function:

button = driver.find_element(By.CSS_SELECTOR, button_selector)
driver.execute_script("arguments[0].click();", button)
time.sleep(2)
selector = Selector(driver.page_source)
Enter fullscreen mode Exit fullscreen mode

To extract the necessary data, you need to find the selector where they are located. In our case, this is the .itIJzb selector, which contains all applications. You need to iterate each application in the loop:

for result in selector.css('.itIJzb'):
    # data extraction will be here
Enter fullscreen mode Exit fullscreen mode

For each app, data such as title, link, category, rating and thumbnail are extracted. You need to find the matching selector and get the text or attribute value. I want to additionally note that the thumbnail is retrieved from the srcset attribute, where it is of better quality:

# .get() returns None when nothing matches, so every value that is
# post-processed is guarded instead of raising.
title = result.css('.OnEJge::text').get()
link = result.css('::attr(href)').get()
link = ('https://play.google.com' + link) if link else link
category = result.css('.ubGTjb .sT93pb.w2kbF:not(.K4Wkre)::text').get()
rating = result.css('.CKzsaf .w2kbF::text').get()
rating = float(rating) if rating else rating
thumbnail = result.css('.stzEZd::attr(srcset)').get()
thumbnail = thumbnail.replace(' 2x', '') if thumbnail else thumbnail
Enter fullscreen mode Exit fullscreen mode

After extracting the data, they are added to the google_play_apps dictionary by key. The key is the value of the chart argument:

google_play_apps['Top charts'][chart].append({
    'title': title,
    'link': link,
    'category': category,
    'rating': rating,
    'thumbnail': thumbnail,
})
Enter fullscreen mode Exit fullscreen mode

The complete function to scrape top charts would look like this:

def scrape_top_charts(driver, chart, button_selector):
    """Switch to one top chart and store its apps in ``google_play_apps``.

    The button matched by *button_selector* is clicked through JavaScript,
    the page source is re-parsed after a two second pause, and one dict per
    ``.itIJzb`` element is appended to ``google_play_apps['Top charts'][chart]``.

    :param driver: Selenium WebDriver holding the loaded page.
    :param chart: key inside ``google_play_apps['Top charts']`` to append to.
    :param button_selector: CSS selector of the button that shows the chart.
    """
    button = driver.find_element(By.CSS_SELECTOR, button_selector)
    driver.execute_script("arguments[0].click();", button)
    time.sleep(2)  # fixed pause before the page source is read again
    selector = Selector(driver.page_source)

    for result in selector.css('.itIJzb'):
        # .get() returns None when nothing matches, so every value that is
        # post-processed is guarded instead of crashing the whole scrape.
        title = result.css('.OnEJge::text').get()
        link = result.css('::attr(href)').get()
        link = ('https://play.google.com' + link) if link else link
        category = result.css('.ubGTjb .sT93pb.w2kbF:not(.K4Wkre)::text').get()
        rating = result.css('.CKzsaf .w2kbF::text').get()
        rating = float(rating) if rating else rating
        thumbnail = result.css('.stzEZd::attr(srcset)').get()
        thumbnail = thumbnail.replace(' 2x', '') if thumbnail else thumbnail

        google_play_apps['Top charts'][chart].append({
            'title': title,
            'link': link,
            'category': category,
            'rating': rating,
            'thumbnail': thumbnail,
        })
Enter fullscreen mode Exit fullscreen mode
Code Explanation
css() to access elements by the passed selector.
::text or ::attr(<attribute>) to extract textual or attribute data from the node.
get() to actually extract the textual data.
float() to make a floating number from a string value.
replace() to replace all occurrences of the old substring with the new one without extra elements.

In the scroll_page function, we called this function 3 times to retrieve data from Top free, Top grossing and Top paid sections.

Scrape all sections

This function takes a full HTML structure and prints all results in JSON format.

To retrieve data from all sections, you need to find the selector that matches each section (the section tag). You need to iterate over each section in the loop:

for section in selector.css('section'):
    # data extraction will be here
Enter fullscreen mode Exit fullscreen mode

It is necessary to extract the section_title and make it a key in the google_play_apps dictionary, where a list of extracted data for each application will be added later:

section_title = section.css('.kcen6d span::text').get()
google_play_apps[section_title] = []
Enter fullscreen mode Exit fullscreen mode

Each section has a certain number of applications, which also need to be iterated over in another loop using the .UVEnyf selector:

for app in section.css('.UVEnyf'):
    # data extraction will be here
Enter fullscreen mode Exit fullscreen mode

For applications that are in sections, there is no such parameter as categories. Data is also retrieved by other selectors.

Also, there is a slight difference in rating extraction. The fact is that for some apps the rating is not displayed. In this case, a ternary expression is used, which extracts the numerical values for the data, if any:

# .get() returns None when nothing matches; guard every value that is
# post-processed, the same way the rating is guarded.
title = app.css('.Epkrse::text').get()
link = app.css('.Si6A0c::attr(href)').get()
link = ('https://play.google.com' + link) if link else link
rating = app.css('.LrNMN::text').get()
rating = float(rating) if rating else rating
thumbnail = app.css('.Q8CSx::attr(srcset)').get()
thumbnail = thumbnail.replace(' 2x', '') if thumbnail else thumbnail
Enter fullscreen mode Exit fullscreen mode

The complete function to scrape all sections would look like this:

def scrape_all_sections(selector):
    """Collect the apps of every ``section`` element and print all results as JSON.

    For each section a list is stored in ``google_play_apps`` under the
    section title, holding one dict per ``.UVEnyf`` element. After all
    sections are processed the whole ``google_play_apps`` dictionary is
    printed.

    :param selector: parsel ``Selector`` of the page.
    """
    for section in selector.css('section'):
        section_title = section.css('.kcen6d span::text').get()
        google_play_apps[section_title] = []

        for app in section.css('.UVEnyf'):
            # .get() returns None when nothing matches; guard every value
            # that is post-processed, the same way the rating is guarded.
            title = app.css('.Epkrse::text').get()
            link = app.css('.Si6A0c::attr(href)').get()
            link = ('https://play.google.com' + link) if link else link
            rating = app.css('.LrNMN::text').get()
            rating = float(rating) if rating else rating
            thumbnail = app.css('.Q8CSx::attr(srcset)').get()
            thumbnail = thumbnail.replace(' 2x', '') if thumbnail else thumbnail

            google_play_apps[section_title].append({
                'title': title,
                'link': link,
                'rating': rating,
                'thumbnail': thumbnail,
            })

    print(json.dumps(google_play_apps, indent=2, ensure_ascii=False))
Enter fullscreen mode Exit fullscreen mode

Output:

{
  "Top charts": {
    "Top free": [
      {
        "title": "Disney+",
        "link": "https://play.google.com/store/apps/details?id=com.disney.disneyplus",
        "category": "Entertainment",
        "rating": 4.5,
        "thumbnail": "https://play-lh.googleusercontent.com/xoGGYH2LgLibLDBoxMg-ZE16b-RNfITw_OgXBWRAPin2FZY4FGB9QKBYApR-0rSCkQ=s128-rw"
      },
      ... other apps
    ],
    "Top grossing": [
      {
        "title": "Google One",
        "link": "https://play.google.com/store/apps/details?id=com.google.android.apps.subscriptions.red",
        "category": "Productivity",
        "rating": 4.3,
        "thumbnail": "https://play-lh.googleusercontent.com/DGAleS46qOedNzJGsB3e29QLpL6Qi6EwIDze95nBvxMAMGEmbE6KOW__2haEkHVDs4Y=s128-rw"
      },
      ... other apps
    ],
    "Top paid": [
      {
        "title": "Muscle Trigger Point Anatomy",
        "link": "https://play.google.com/store/apps/details?id=com.real.bodywork.muscle.trigger.points",
        "category": "Medical",
        "rating": 4.6,
        "thumbnail": "https://play-lh.googleusercontent.com/dX8bDLm4Aq0vF131uvjJO83EghJ9fIPIEfgLdcXwUXF7iZnpxkR53uy94H9FHocJRQ=s128-rw"
      },
      ... other apps
    ]
  },
  "Popular apps": [
    {
      "title": "WhatsApp Messenger",
      "link": "https://play.google.com/store/apps/details?id=com.whatsapp",
      "rating": 4.3,
      "thumbnail": "https://play-lh.googleusercontent.com/bYtqbOcTYOlgc6gqZ2rwb8lptHuwlNE75zYJu6Bn076-hTmvd96HH-6v7S0YUAAJXoJN=s512-rw"
    },
    ... other apps
  ],
  ... other sections
  "Book a getaway": [
    {
      "title": "Hotels.com: Book Hotels & More",
      "link": "https://play.google.com/store/apps/details?id=com.hcom.android",
      "rating": 4.4,
      "thumbnail": "https://play-lh.googleusercontent.com/onuxspmiR0fJZRWXZCToyBPht5yZE55drqWqoWWDj9YwJvKpg2AY4lt1LdymRYkRlh0=s512-rw"
    },
    ... other apps
  ]
}
Enter fullscreen mode Exit fullscreen mode

Using Google Play Apps Store API from SerpApi

This section is to show the comparison between the DIY solution and our solution.

The main difference is that it's a quicker approach. Google Play Apps Store API will bypass blocks from search engines and you don't have to create the parser from scratch and maintain it.

First, we need to install google-search-results:

pip install google-search-results
Enter fullscreen mode Exit fullscreen mode

Import the necessary libraries for work:

from serpapi import GoogleSearch
import os, json
Enter fullscreen mode Exit fullscreen mode

Next, we write a search query and the necessary parameters for making a request:

params = {
    # https://docs.python.org/3/library/os.html#os.getenv
    'api_key': os.getenv('API_KEY'),    # your serpapi api
    'engine': 'google_play',            # SerpApi search engine
    'store': 'apps'                     # Google Play Apps
}
Enter fullscreen mode Exit fullscreen mode

We then create a search object where the data is retrieved from the SerpApi backend. In the result_dict dictionary we get data from JSON:

search = GoogleSearch(params)
result_dict = search.get_dict()
Enter fullscreen mode Exit fullscreen mode

The data is retrieved quite simply, we just need to turn to the 'organic_results' key:

google_play_apps = result_dict['organic_results']
Enter fullscreen mode Exit fullscreen mode

Example code to integrate:

from serpapi import GoogleSearch
import os, json

params = {
    # https://docs.python.org/3/library/os.html#os.getenv
    'api_key': os.getenv('API_KEY'),    # your SerpApi API key
    'engine': 'google_play',            # SerpApi search engine
    'store': 'apps'                     # Google Play Apps
}

search = GoogleSearch(params)           # where data extraction happens on the SerpApi backend
result_dict = search.get_dict()         # JSON -> Python dict

# NOTE(review): SerpApi is expected to report failures (e.g. a missing or
# invalid API key) through an 'error' field - confirm against the API docs.
# Surfacing it is clearer than a bare KeyError on 'organic_results'.
if 'error' in result_dict:
    raise SystemExit(f"SerpApi error: {result_dict['error']}")

# Fall back to an empty list when the response carries no organic results.
google_play_apps = result_dict.get('organic_results', [])

print(json.dumps(google_play_apps, indent=2, ensure_ascii=False))
Enter fullscreen mode Exit fullscreen mode

Output:

[
  {
    "title": "Popular apps",
    "items": [
      {
        "title": "WhatsApp Messenger",
        "link": "https://play.google.com/store/apps/details?id=com.whatsapp",
        "product_id": "com.whatsapp",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_play_product&gl=us&hl=en&product_id=com.whatsapp&store=apps",
        "rating": 4.3,
        "thumbnail": "https://play-lh.googleusercontent.com/bYtqbOcTYOlgc6gqZ2rwb8lptHuwlNE75zYJu6Bn076-hTmvd96HH-6v7S0YUAAJXoJN=s256"
      },
      ... other items
      {
        "title": "Zoom - One Platform to Connect",
        "link": "https://play.google.com/store/apps/details?id=us.zoom.videomeetings",
        "product_id": "us.zoom.videomeetings",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_play_product&gl=us&hl=en&product_id=us.zoom.videomeetings&store=apps",
        "rating": 4.3,
        "thumbnail": "https://play-lh.googleusercontent.com/yZsmiNjmji3ZoOuLthoVvptLB9cZ0vCmitcky4OUXNcEFV3IEQkrBD2uu5kuWRF5_ERA=s256"
      }
    ]
  },
  ... other sections
  {
    "title": "Communication",
    "items": [
      {
        "title": "Google Voice",
        "link": "https://play.google.com/store/apps/details?id=com.google.android.apps.googlevoice",
        "product_id": "com.google.android.apps.googlevoice",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_play_product&gl=us&hl=en&product_id=com.google.android.apps.googlevoice&store=apps",
        "rating": 4.4,
        "thumbnail": "https://play-lh.googleusercontent.com/Gf8ufuFbtfXO5Y6JuZjnG0iIpZh21zNTqZ5aiAXO8mA38mvXzY-1s27FWbGlp51paQ=s256"
      },
      ... other items
      {
        "title": "Email - Fast & Secure Mail",
        "link": "https://play.google.com/store/apps/details?id=com.easilydo.mail",
        "product_id": "com.easilydo.mail",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_play_product&gl=us&hl=en&product_id=com.easilydo.mail&store=apps",
        "rating": 4.6,
        "thumbnail": "https://play-lh.googleusercontent.com/WvlTOWc59NcVtCfnxMjNZIG2mRHrjMNadTU7dnN2oA3UwT6tz70KodnLOwiimwc49Obx=s256"
      }
    ]
  }
]
Enter fullscreen mode Exit fullscreen mode

Join us on Twitter | YouTube

Add a Feature Request💫 or a Bug🐞

Top comments (0)