- What will be scraped
- Full Code
- Preparation
- Code Explanation
- Using Google Events API from SerpApi
- Links
What will be scraped
Full Code
If you don't need explanation, have a look at full code example in the online IDE.
import time, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from parsel import Selector
def scroll_page(url):
    """Load *url* in headless Chrome and scroll the events list to its end.

    The Google Events page loads more events only when the list container
    (the ``.UbEfxe`` element) is scrolled, so the container is scrolled to
    the bottom repeatedly until its ``scrollHeight`` stops growing.

    Args:
        url: Address of the Google Events page to open.

    Returns:
        A ``parsel.Selector`` built from the page source after scrolling.

    Raises:
        Any exception raised by Selenium (for example when the ``.UbEfxe``
        element is not on the page) propagates to the caller; the browser
        is shut down first.
    """
    # The same height probe is needed before the loop and after every scroll.
    get_height_js = "return document.querySelector('.UbEfxe').scrollHeight;"
    scroll_to_bottom_js = (
        'document.querySelector(".UbEfxe")'
        '.scrollTo(0, document.querySelector(".UbEfxe").scrollHeight);'
    )

    service = Service(ChromeDriverManager().install())

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--lang=en')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36')
    options.add_argument('--no-sandbox')

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)

        old_height = driver.execute_script(get_height_js)
        while True:
            driver.execute_script(scroll_to_bottom_js)
            # Give the page a moment to append the next batch of events.
            time.sleep(1)

            new_height = driver.execute_script(get_height_js)
            if new_height == old_height:
                # Nothing new was appended: the end of the list is reached.
                break
            old_height = new_height

        return Selector(driver.page_source)
    finally:
        # Always stop the browser, otherwise a failed run leaves a
        # headless Chrome process behind.
        driver.quit()
def scrape_google_events(selector):
    """Extract every event from a parsed Google Events page.

    Args:
        selector: Parsed page exposing ``css()``; ``main()`` passes the
            ``parsel.Selector`` returned by ``scroll_page()``.

    Returns:
        A list with one dict per ``.scm-c`` element. Each dict has the keys
        ``title``, ``date``, ``address``, ``link``, ``event_location_map``,
        ``description``, ``ticket_info`` and ``venue``. A value that is not
        present on the page is ``None`` (``address`` and ``ticket_info``
        are empty lists in that case).
    """
    base_url = 'https://www.google.com'

    def first(node, query):
        """Return the first match of *query* under *node*, or None."""
        return node.css(query).get()

    def absolute(path):
        """Turn a site-relative path into a full URL; a missing path stays None."""
        return base_url + path if path else None

    data = []

    for event in selector.css('.scm-c'):
        # Day and month live in separate nodes; join only the parts that
        # exist so a missing date does not become the string "None None".
        date_parts = (first(event, '.FTUoSb::text'), first(event, '.omoMNe::text'))
        date_start = ' '.join(part for part in date_parts if part) or None

        ticket_info = [
            {
                'source': ticket.css('::attr(data-domain)').get(),
                'link': ticket.css('.SKIyM::attr(href)').get(),
                'link_type': ticket.css('.uaYYHd::text').get(),
            }
            for ticket in event.css('.RLN0we[jsname="CzizI"] div[data-domain]')
        ]

        # Rating and review count are not present for every venue.
        rating_text = first(event, '.UIHjI::text')
        reviews_text = first(event, '.z5jxId::text')
        venue_rating = float(rating_text) if rating_text else None
        # Keep only the leading number and drop the thousands separator.
        venue_reviews = int(reviews_text.replace(',', '').split()[0]) if reviews_text else None

        data.append({
            'title': first(event, '.dEuIWb::text'),
            'date': {
                'start_date': date_start,
                'when': first(event, '.Gkoz3::text'),
            },
            'address': [part.css('::text').get() for part in event.css('.ov85De span')],
            'link': first(event, '.zTH3xc::attr(href)'),
            'event_location_map': {
                # The page holds site-relative paths here; a missing one
                # yields None instead of raising TypeError.
                'image': absolute(first(event, '.lu_vs::attr(data-bsrc)')),
                'link': absolute(first(event, '.ozQmAd::attr(data-url)')),
            },
            'description': first(event, '.PVlUWc::text'),
            'ticket_info': ticket_info,
            'venue': {
                'name': first(event, '.RVclrc::text'),
                'rating': venue_rating,
                'reviews': venue_reviews,
                'link': absolute(first(event, '.pzNwRe a::attr(href)')),
            },
        })

    return data
def main():
    """Scrape Google Events for the configured search and print them as JSON."""
    # Imported here so the entry point carries its own dependency.
    from urllib.parse import urlencode

    params = {
        'q': 'Events in Austin',    # search query
        'ibp': 'htl;events',        # Google Events page
        'hl': 'en',                 # language
        'gl': 'US',                 # country of the search
    }

    # urlencode() escapes spaces and reserved characters in the values and
    # includes every key of `params`, so extra parameters added to the
    # dictionary end up in the URL as well.
    url = f'https://www.google.com/search?{urlencode(params)}'

    result = scroll_page(url)
    google_events = scrape_google_events(result)

    print(json.dumps(google_events, indent=2, ensure_ascii=False))


if __name__ == '__main__':
    main()
Preparation
Install libraries:
pip install parsel selenium webdriver webdriver_manager
Reduce the chance of being blocked
Make sure you pass a `user-agent` in the request headers so that the request looks like a visit from a "real" user. The default `requests` user-agent is `python-requests`, and websites understand that such a request was most likely sent by a script. Check what your user-agent is.
There's a how to reduce the chance of being blocked while web scraping blog post that can get you familiar with basic and more advanced approaches.
Code Explanation
Import libraries:
import time, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from parsel import Selector
Library | Purpose |
---|---|
json |
to convert extracted data to a JSON object. |
time |
to work with time in Python. |
webdriver |
to drive a browser natively, as a user would, either locally or on a remote machine using the Selenium server. |
Service |
to manage the starting and stopping of the ChromeDriver. |
Selector |
XML/HTML parser that has full XPath and CSS selectors support. |
Top-level code environment
At the beginning of the function, parameters are defined for generating the URL
. If you want to pass other parameters to the URL, you can do so using the params
dictionary.
Next, the URL is passed to the scroll_page(URL)
function to scroll the page and get all data. The result that this function returns is passed to the scrape_google_events(result)
function to extract the necessary data. The explanation of these functions will be in the corresponding headings below.
This code uses the generally accepted rule of using the __name__ == "__main__"
construct:
def main():
    """Scrape Google Events for the configured search and print them as JSON."""
    # Imported here so the entry point carries its own dependency.
    from urllib.parse import urlencode

    params = {
        'q': 'Events in Austin',    # search query
        'ibp': 'htl;events',        # Google Events page
        'hl': 'en',                 # language
        'gl': 'US',                 # country of the search
    }

    # urlencode() escapes spaces and reserved characters in the values and
    # includes every key of `params`, so extra parameters added to the
    # dictionary end up in the URL as well.
    url = f'https://www.google.com/search?{urlencode(params)}'

    result = scroll_page(url)
    google_events = scrape_google_events(result)

    print(json.dumps(google_events, indent=2, ensure_ascii=False))


if __name__ == '__main__':
    main()
The `main()` call is executed only when the file is run directly. If the file is imported into another module, the condition is false and `main()` is not called.
You can watch the video Python Tutorial: if name == 'main' for more details.
Scroll page
The function takes the URL and returns a full HTML structure.
First, let's understand how pagination works on the Google Events page. Data does not load all at once. If the user needs more data, they simply scroll through the section where the list of events is presented, and the site loads another small batch of data. Accordingly, to get all the data, you need to scroll to the end of the list of events.
In this case, selenium
library is used, which allows you to simulate user actions in the browser. For selenium
to work, you need to use ChromeDriver
, which can be downloaded manually or using code. In our case, the second method is used. To control the start and stop of ChromeDriver
, you need to use Service
which will install browser binaries under the hood:
service = Service(ChromeDriverManager().install())
You should also add options
to work correctly:
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--lang=en')
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36')
options.add_argument('--no-sandbox')
Chrome options | Explanation |
---|---|
--headless |
to run Chrome in headless mode. |
--lang=en |
to set the browser language to English. |
user-agent |
to act as a "real" user request from the browser by passing it to request headers. Check what's your user-agent . |
--no-sandbox |
to make chromedriver work properly on different machines. |
Now we can start webdriver
and pass the url to the get()
method.
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)
The page scrolling algorithm looks like this:
1. Find out the initial page height and write the result to the `old_height` variable.
2. Scroll the page using the script and wait 1 second for the data to load.
3. Find out the new page height and write the result to the `new_height` variable.
4. If the variables `new_height` and `old_height` are equal, then we complete the algorithm; otherwise we write the value of the variable `new_height` to the variable `old_height` and return to step 2.
Getting the page height and scroll is done by pasting the JavaScript code into the execute_script()
method.
old_height = driver.execute_script("""
function getHeight() {
return document.querySelector('.UbEfxe').scrollHeight;
}
return getHeight();
""")
while True:
driver.execute_script('document.querySelector(".UbEfxe").scrollTo(0, document.querySelector(".UbEfxe").scrollHeight);')
time.sleep(1)
new_height = driver.execute_script("""
function getHeight() {
return document.querySelector('.UbEfxe').scrollHeight;
}
return getHeight();
""")
if new_height == old_height:
break
old_height = new_height
Now we need to process the HTML using `Selector` from the Parsel package, to which we pass the HTML structure with all the data that was received after scrolling the page. This is necessary to successfully retrieve data in the next function. After all the operations are done, stop the driver:
selector = Selector(driver.page_source)
driver.quit()
The function looks like this:
def scroll_page(url):
    """Load *url* in headless Chrome and scroll the events list to its end.

    The Google Events page loads more events only when the list container
    (the ``.UbEfxe`` element) is scrolled, so the container is scrolled to
    the bottom repeatedly until its ``scrollHeight`` stops growing.

    Args:
        url: Address of the Google Events page to open.

    Returns:
        A ``parsel.Selector`` built from the page source after scrolling.

    Raises:
        Any exception raised by Selenium (for example when the ``.UbEfxe``
        element is not on the page) propagates to the caller; the browser
        is shut down first.
    """
    # The same height probe is needed before the loop and after every scroll.
    get_height_js = "return document.querySelector('.UbEfxe').scrollHeight;"
    scroll_to_bottom_js = (
        'document.querySelector(".UbEfxe")'
        '.scrollTo(0, document.querySelector(".UbEfxe").scrollHeight);'
    )

    service = Service(ChromeDriverManager().install())

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--lang=en')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36')
    options.add_argument('--no-sandbox')

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)

        old_height = driver.execute_script(get_height_js)
        while True:
            driver.execute_script(scroll_to_bottom_js)
            # Give the page a moment to append the next batch of events.
            time.sleep(1)

            new_height = driver.execute_script(get_height_js)
            if new_height == old_height:
                # Nothing new was appended: the end of the list is reached.
                break
            old_height = new_height

        return Selector(driver.page_source)
    finally:
        # Always stop the browser, otherwise a failed run leaves a
        # headless Chrome process behind.
        driver.quit()
In the gif below, I demonstrate how this function works:
Scrape Google Events
This function takes a full HTML structure and returns a list with the extracted data.
Declaring the data
list where the extracted data will be added:
data = []
To extract the necessary data, you need to find the selector where they are located. In our case, this is the .scm-c
selector, which contains all events. You need to iterate each event in the loop:
for event in selector.css('.scm-c'):
# data extraction will be here
Data such as title
, date_when
, link
, description
and venue_name
are easily retrieved:
title = event.css('.dEuIWb::text').get()
date_when = event.css('.Gkoz3::text').get()
link = event.css('.zTH3xc::attr(href)').get()
description = event.css('.PVlUWc::text').get()
venue_name = event.css('.RVclrc::text').get()
When location_image
and location_link
are retrieved, only part of the link is returned as a result. So you need to add https://www.google.com
to form a full link:
location_image = 'https://www.google.com' + event.css('.lu_vs::attr(data-bsrc)').get()
location_link = 'https://www.google.com' + event.css('.ozQmAd::attr(data-url)').get()
Extracting date_start
differs from the previous ones in that you need to extract the day and month separately:
date_start = f"{event.css('.FTUoSb::text').get()} {event.css('.omoMNe::text').get()}"
The address
and ticket_info
lists contain multiple elements in their selector, so they are extracted using list comprehensions. I want to draw your attention to the fact that dictionaries with all the necessary data are added to ticket_info
:
address = [part.css('::text').get() for part in event.css('.ov85De span')]
ticket_info = [
{
'source': ticket.css('::attr(data-domain)').get(),
'link': ticket.css('.SKIyM::attr(href)').get(),
'link_type': ticket.css('.uaYYHd::text').get(),
}
for ticket in event.css('.RLN0we[jsname="CzizI"] div[data-domain]')
]
Data such as venue_rating
, venue_reviews
and venue_link
may not be present in every event. Therefore, when extracting them, a ternary operator was used:
venue_rating = float(event.css('.UIHjI::text').get()) if event.css('.UIHjI::text').get() else None
venue_reviews = int(event.css('.z5jxId::text').get().replace(',', '').split()[0]) if event.css('.z5jxId::text').get() else None
venue_link = 'https://www.google.com' + event.css('.pzNwRe a::attr(href)').get() if event.css('.pzNwRe a::attr(href)').get() else None
After extracting all the data, a dictionary is formed, which is subsequently appended to the data
list:
data.append({
'title': title,
'date':{
'start_date': date_start,
'when': date_when
},
'address': address,
'link': link,
'event_location_map': {
'image': location_image,
'link': location_link
},
'description': description,
'ticket_info': ticket_info,
'venue': {
'name': venue_name,
'rating': venue_rating,
'reviews': venue_reviews,
'link': venue_link
}
})
The complete function to scrape all data would look like this:
def scrape_google_events(selector):
    """Extract every event from a parsed Google Events page.

    Args:
        selector: Parsed page exposing ``css()``; ``main()`` passes the
            ``parsel.Selector`` returned by ``scroll_page()``.

    Returns:
        A list with one dict per ``.scm-c`` element. Each dict has the keys
        ``title``, ``date``, ``address``, ``link``, ``event_location_map``,
        ``description``, ``ticket_info`` and ``venue``. A value that is not
        present on the page is ``None`` (``address`` and ``ticket_info``
        are empty lists in that case).
    """
    base_url = 'https://www.google.com'

    def first(node, query):
        """Return the first match of *query* under *node*, or None."""
        return node.css(query).get()

    def absolute(path):
        """Turn a site-relative path into a full URL; a missing path stays None."""
        return base_url + path if path else None

    data = []

    for event in selector.css('.scm-c'):
        # Day and month live in separate nodes; join only the parts that
        # exist so a missing date does not become the string "None None".
        date_parts = (first(event, '.FTUoSb::text'), first(event, '.omoMNe::text'))
        date_start = ' '.join(part for part in date_parts if part) or None

        ticket_info = [
            {
                'source': ticket.css('::attr(data-domain)').get(),
                'link': ticket.css('.SKIyM::attr(href)').get(),
                'link_type': ticket.css('.uaYYHd::text').get(),
            }
            for ticket in event.css('.RLN0we[jsname="CzizI"] div[data-domain]')
        ]

        # Rating and review count are not present for every venue.
        rating_text = first(event, '.UIHjI::text')
        reviews_text = first(event, '.z5jxId::text')
        venue_rating = float(rating_text) if rating_text else None
        # Keep only the leading number and drop the thousands separator.
        venue_reviews = int(reviews_text.replace(',', '').split()[0]) if reviews_text else None

        data.append({
            'title': first(event, '.dEuIWb::text'),
            'date': {
                'start_date': date_start,
                'when': first(event, '.Gkoz3::text'),
            },
            'address': [part.css('::text').get() for part in event.css('.ov85De span')],
            'link': first(event, '.zTH3xc::attr(href)'),
            'event_location_map': {
                # The page holds site-relative paths here; a missing one
                # yields None instead of raising TypeError.
                'image': absolute(first(event, '.lu_vs::attr(data-bsrc)')),
                'link': absolute(first(event, '.ozQmAd::attr(data-url)')),
            },
            'description': first(event, '.PVlUWc::text'),
            'ticket_info': ticket_info,
            'venue': {
                'name': first(event, '.RVclrc::text'),
                'rating': venue_rating,
                'reviews': venue_reviews,
                'link': absolute(first(event, '.pzNwRe a::attr(href)')),
            },
        })

    return data
Code | Explanation |
---|---|
css() |
to access elements by the passed selector. |
::text or ::attr(<attribute>) |
to extract textual or attribute data from the node. |
get() |
to actually extract the textual data. |
Output:
[
{
"title": "Lit @ Haute Spot",
"date": {
"start_date": "8 Nov",
"when": "Tue, Nov 8, 5 – 10 PM CST"
},
"address": [
"Haute Spot Event Venue",
"1501 E New Hope Dr, Cedar Park, TX"
],
"link": "https://m.facebook.com/events/haute-spot/hoobastank-lit-tried-true-tour-w-alien-ant-farm-kris-roe-of-the-ataris-at-haute-/3231075087211137/",
"event_location_map": {
"image": "https://www.google.com/maps/vt/data=Mi6idsSxpVUEprl-uYK-yKdNGzRAj_h_XnRilW9maYmoOx-D3KFQZQtRsuMJ7Crgf7ivGpAkEOk-4oo7z-CekWI8TiSUsWbQJ5bb2Td2OYPZ45b4S6s",
"link": "https://www.google.com/maps/place//data=!4m2!3m1!1s0x865b2d2f76d2be53:0x9aee4d078a09a6cd?sa=X&hl=en&gl=USl"
},
"description": "INCOMING: Hoobstank & Lit will bring their Tried & True Tour to Haute Spot in Cedar Park, TX on Tuesday, Nov. 8! Alien Ant Farm & Kris Roe (of The Ataris) will open the show! Full concert schedule...",
"ticket_info": [
{
"source": "Closeseats.com",
"link": "https://www.closeseats.com/cedar-park/alternative/hoobastank-and-lit-tickets/5285105",
"link_type": "TICKETS"
},
{
"source": "Feefreeticket.com",
"link": "https://www.feefreeticket.com/hoobastank-and-lit-haute-spot/5285105",
"link_type": "TICKETS"
},
{
"source": "Bigtowntickets.com",
"link": "https://www.bigtowntickets.com/Events/Alternative-Tickets/Hoobastank-and-Lit-2022-11-08-18-00-00",
"link_type": "TICKETS"
},
{
"source": "Ticketsource.com",
"link": "https://www.ticketsource.com/2135448/Hoobastank",
"link_type": "TICKETS"
},
{
"source": "Facebook",
"link": "https://m.facebook.com/events/haute-spot/hoobastank-lit-tried-true-tour-w-alien-ant-farm-kris-roe-of-the-ataris-at-haute-/3231075087211137/",
"link_type": "MORE INFO"
}
],
"venue": {
"name": "Haute Spot Event Venue",
"rating": 4.4,
"reviews": 349,
"link": "https://www.google.com/search?hl=en&gl=USl&q=Haute+Spot+Event+Venue&ludocid=11163945221074036429&ibp=gwp%3B0,7"
}
},
... other results
]
Using Google Events API from SerpApi
This section is to show the comparison between the DIY solution and our solution.
The main difference is that it's a quicker approach. Google Events API will bypass blocks from search engines and you don't have to create the parser from scratch and maintain it.
First, we need to install google-search-results
:
pip install google-search-results
Import the necessary libraries for work:
from serpapi import GoogleSearch
import os, json
Next, we write a search query and the necessary parameters for making a request:
params = {
# https://docs.python.org/3/library/os.html#os.getenv
'api_key': os.getenv('API_KEY'), # your serpapi api
'q': 'Events in Austin', # search query
'engine': 'google_events', # SerpApi search engine
'hl': 'en', # language
'gl': 'us', # country of the search
'start': 0 # pagination
}
Declaring the google_events_results
list where the extracted data will be added:
google_events_results = []
Since we want to extract all the data, we need to use the 'start'
parameter, which is responsible for pagination.
Let's implement an infinite loop that will increase the value of the 'start'
parameter by 10 on each iteration. This will continue as long as there is something to extract:
while True:
search = GoogleSearch(params) # where data extraction happens on the SerpApi backend
result_dict = search.get_dict() # JSON -> Python dict
if 'error' in result_dict:
break
# data extraction will be here
params['start'] += 10
The data is retrieved quite simply, we just need to turn to the 'events_results'
key.
for result in result_dict['events_results']:
google_events_results.append(result)
Example code to integrate:
from serpapi import GoogleSearch
import os, json
# Request parameters for SerpApi's Google Events engine.
params = {
    # https://docs.python.org/3/library/os.html#os.getenv
    'api_key': os.getenv('API_KEY'),    # your SerpApi API key
    'q': 'Events in Austin',            # search query
    'engine': 'google_events',          # SerpApi search engine
    'hl': 'en',                         # language
    'gl': 'us',                         # country of the search
    'start': 0                          # pagination
}

# Events collected from every page of results.
google_events_results = []

while True:
    search = GoogleSearch(params)       # where data extraction happens on the SerpApi backend
    result_dict = search.get_dict()     # JSON -> Python dict

    # An error response ends the pagination.
    if 'error' in result_dict:
        break

    # A response without events also ends the pagination instead of
    # raising KeyError on the missing 'events_results' key.
    page_results = result_dict.get('events_results', [])
    if not page_results:
        break
    google_events_results.extend(page_results)

    # Move on to the next page of results.
    params['start'] += 10

print(json.dumps(google_events_results, indent=2, ensure_ascii=False))
Output:
[
{
"title": "Lit @ Haute Spot",
"date": {
"start_date": "Nov 8",
"when": "Tue, Nov 8, 5 – 10 PM CST"
},
"address": [
"Haute Spot Event Venue, 1501 E New Hope Dr",
"Cedar Park, TX"
],
"link": "https://m.facebook.com/events/haute-spot/hoobastank-lit-tried-true-tour-w-alien-ant-farm-kris-roe-of-the-ataris-at-haute-/3231075087211137/",
"event_location_map": {
"image": "https://www.google.com/maps/vt/data=Mi6idsSxpVUEprl-uYK-yKdNGzRAj_h_XnRilW9maYmoOx-D3KFQZQtRsuMJ7Crgf7ivGpAkEOk-4oo7z-CekWI8TiSUsWbQJ5bb2Td2OYPZ45b4S6s",
"link": "https://www.google.com/maps/place//data=!4m2!3m1!1s0x865b2d2f76d2be53:0x9aee4d078a09a6cd?sa=X",
"serpapi_link": "https://serpapi.com/search.json?data=%214m2%213m1%211s0x865b2d2f76d2be53%3A0x9aee4d078a09a6cd&engine=google_maps&google_domain=google.com&hl=en&q=Events+in+Austin&start=0&type=place"
},
"description": "INCOMING: Hoobstank & Lit will bring their Tried & True Tour to Haute Spot in Cedar Park, TX on Tuesday, Nov. 8! Alien Ant Farm & Kris Roe (of The Ataris) will open the show! Full concert schedule...",
"ticket_info": [
{
"source": "Closeseats.com",
"link": "https://www.closeseats.com/cedar-park/alternative/hoobastank-and-lit-tickets/5285105",
"link_type": "tickets"
},
{
"source": "Feefreeticket.com",
"link": "https://www.feefreeticket.com/hoobastank-and-lit-haute-spot/5285105",
"link_type": "tickets"
},
{
"source": "Bigtowntickets.com",
"link": "https://www.bigtowntickets.com/Events/Alternative-Tickets/Hoobastank-and-Lit-2022-11-08-18-00-00",
"link_type": "tickets"
},
{
"source": "Ticketsource.com",
"link": "https://www.ticketsource.com/2135448/Hoobastank",
"link_type": "tickets"
},
{
"source": "Facebook",
"link": "https://m.facebook.com/events/haute-spot/hoobastank-lit-tried-true-tour-w-alien-ant-farm-kris-roe-of-the-ataris-at-haute-/3231075087211137/",
"link_type": "more info"
}
],
"venue": {
"name": "Haute Spot Event Venue",
"rating": 4.4,
"reviews": 349,
"link": "https://www.google.com/search?q=Haute+Spot+Event+Venue&ludocid=11163945221074036429&ibp=gwp%3B0,7"
},
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTYdxz7IF7Vmp9CMyaqElXJbl7oqMaSSTrtGaXFa4U&s",
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRTjb5umquoSVIOAaLDN06mjBdTz-6OzVCACDu43lSXmA&s=10"
},
... other results
]
Links
Add a Feature Request💫 or a Bug🐞
Top comments (0)