DEV Community

Artur Chukhrai
Artur Chukhrai

Posted on • Edited on

Scrape Google Flights with Python

What will be scraped

wwbs-google-flights

Full Code

from playwright.sync_api import sync_playwright
from selectolax.lexbor import LexborHTMLParser
import json, time


def get_page(playwright, from_place, to_place, departure_date, return_date):
    """Fill in the Google Flights search form and return the parsed results page.

    Launches Chromium with ``headless=False``, types the given values into the
    "From", "To", "Departure" and "Return" fields, starts the search, expands
    the result list when a "More flights" button is present, and parses the
    final HTML.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
        from_place: Text typed into the "From" field.
        to_place: Text typed into the "To" field.
        departure_date: Text typed into the "Departure" field.
        return_date: Text typed into the "Return" field.

    Returns:
        A ``LexborHTMLParser`` built from the HTML of the results page.
    """
    # Keep the browser handle so the browser itself can be closed, not only the page.
    browser = playwright.chromium.launch(headless=False)

    try:
        page = browser.new_page()
        page.goto('https://www.google.com/travel/flights?hl=en-US&curr=USD')

        # NOTE(review): the fixed sleeps below presumably give the UI time to
        # react to each action; they are kept exactly as in the original flow.

        # type "From"
        from_place_field = page.query_selector_all('.e5F5td')[0]
        from_place_field.click()
        time.sleep(1)
        from_place_field.type(from_place)
        time.sleep(1)
        page.keyboard.press('Enter')

        # type "To"
        to_place_field = page.query_selector_all('.e5F5td')[1]
        to_place_field.click()
        time.sleep(1)
        to_place_field.type(to_place)
        time.sleep(1)
        page.keyboard.press('Enter')

        # type "Departure date"
        departure_date_field = page.query_selector_all('[aria-label="Departure"]')[0]
        departure_date_field.click()
        time.sleep(1)
        departure_date_field.type(departure_date)
        time.sleep(1)
        page.query_selector('.WXaAwc .VfPpkd-LgbsSe').click()
        time.sleep(1)

        # type "Return date"
        return_date_field = page.query_selector_all('[aria-label="Return"]')[0]
        return_date_field.click()
        time.sleep(1)
        return_date_field.type(return_date)
        time.sleep(1)
        page.query_selector('.WXaAwc .VfPpkd-LgbsSe').click()
        time.sleep(1)

        # press "Explore"
        page.query_selector('.MXvFbd .VfPpkd-LgbsSe').click()
        time.sleep(2)

        # press "More flights"; query_selector() returns None when nothing
        # matches, in which case there is nothing to expand
        more_flights_button = page.query_selector('.zISZ5c button')
        if more_flights_button is not None:
            more_flights_button.click()
            time.sleep(2)

        return LexborHTMLParser(page.content())
    finally:
        # Runs even when a step above raises, so no browser is left behind;
        # closing the browser also closes the page opened from it.
        browser.close()


def _text_or_none(node):
    """Return the text of ``node``, or None when the selector matched nothing."""
    return node.text() if node is not None else None


def scrape_google_flights(parser):
    """Extract flight listings, grouped by category, from a parsed results page.

    Args:
        parser: Parsed page exposing ``root.css()``; result nodes must provide
            ``css()``, ``css_first()`` and ``text()``.

    Returns:
        A dict whose keys are the category headings (lower-cased, spaces
        replaced by underscores) and whose values are lists of flight dicts.
        Every flight dict has the keys ``departure_date``, ``arrival_date``,
        ``company``, ``duration``, ``stops``, ``emissions``,
        ``emission_comparison``, ``price`` and ``price_type``, plus either
        ``service`` or ``departure_airport``/``arrival_airport``. A value is
        None when its element is missing from the result.
    """
    data = {}

    categories = parser.root.css('.zBTtmb')
    category_results = parser.root.css('.Rk10dc')

    for category, category_result in zip(categories, category_results):
        category_data = []

        for result in category_result.css('.yR1fYc'):
            # the same selector yields both times: departure first, arrival second
            date = result.css('[jscontroller="cNtv4b"] span')

            # every field is looked up once and tolerates a missing element,
            # the same way price_type always did
            flight_data = {
                'departure_date': date[0].text() if len(date) > 0 else None,
                'arrival_date': date[1].text() if len(date) > 1 else None,
                'company': _text_or_none(result.css_first('.Ir0Voe .sSHqwe')),
                'duration': _text_or_none(result.css_first('.AdWm1c.gvkrdb')),
                'stops': _text_or_none(result.css_first('.EfT7Ae .ogfYpf')),
                'emissions': _text_or_none(result.css_first('.V1iAHe .AdWm1c')),
                'emission_comparison': _text_or_none(result.css_first('.N6PNV')),
                'price': _text_or_none(result.css_first('.U3gSDe .FpEdX span')),
                'price_type': _text_or_none(result.css_first('.U3gSDe .N872Rd'))
            }

            service = result.css_first('.hRBhge')

            if service is not None:
                flight_data['service'] = service.text()
            else:
                airports = result.css_first('.Ak5kof .sSHqwe')
                departure_airport = None
                arrival_airport = None

                if airports is not None:
                    # NOTE(review): the sample output in the "Output" section
                    # shows the same airport for both fields, so these two
                    # selectors may resolve to the same node - verify against
                    # the live markup.
                    departure_airport = _text_or_none(airports.css_first('span:nth-child(1) .eoY5cb'))
                    arrival_airport = _text_or_none(airports.css_first('span:nth-child(2) .eoY5cb'))

                flight_data['departure_airport'] = departure_airport
                flight_data['arrival_airport'] = arrival_airport

            category_data.append(flight_data)

        data[category.text().lower().replace(' ', '_')] = category_data

    return data


def run(playwright):
    """Search Google Flights for a sample round trip and print the results.

    Passes the search values defined below to ``get_page()``, extracts the
    listings with ``scrape_google_flights()`` and prints them as indented JSON.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
    """
    from_place = 'Seattle'
    to_place = 'Las Vegas'
    departure_date = '14-6-2024'
    return_date = '20-6-2024'

    parser = get_page(playwright, from_place, to_place, departure_date, return_date)
    google_flights_results = scrape_google_flights(parser)

    # ensure_ascii=False keeps non-ASCII characters readable in the output
    print(json.dumps(google_flights_results, indent=2, ensure_ascii=False))


# Start the browser only when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    with sync_playwright() as playwright:
        run(playwright)
Enter fullscreen mode Exit fullscreen mode

Preparation

Install libraries:

pip install playwright selectolax
Enter fullscreen mode Exit fullscreen mode

Install the required browser:

playwright install chromium
Enter fullscreen mode Exit fullscreen mode

Code Explanation

Import libraries:

from playwright.sync_api import sync_playwright
from selectolax.lexbor import LexborHTMLParser
import json, time
Enter fullscreen mode Exit fullscreen mode
Library Purpose
sync_playwright for the synchronous API. Playwright also has an asynchronous API that uses the asyncio module.
LexborHTMLParser a fast HTML5 parser with CSS selectors using Lexbor engine.
json to convert the extracted data to a JSON-formatted string.
time to work with time in Python.

The next part of the code is divided into functions. Each function is described in the corresponding heading below.

Working with a Playwright

Declare a function:

def run(playwright):
    # further code ...
Enter fullscreen mode Exit fullscreen mode

Passing user data to form a request:

from_place = 'Seattle'
to_place = 'Las Vegas'
departure_date = '14-6-2024'
return_date = '20-6-2024'
Enter fullscreen mode Exit fullscreen mode

The playwright object and the previous data are then passed to the get_page(playwright, from_place, to_place, departure_date, return_date) function. The parser returned by this function is passed to the scrape_google_flights(parser) function to extract all the data. The explanation of these functions will be in the corresponding headings below.

parser = get_page(playwright, from_place, to_place, departure_date, return_date)
google_flights_results = scrape_google_flights(parser)
Enter fullscreen mode Exit fullscreen mode

After all the data is retrieved, it is output in JSON format:

print(json.dumps(google_flights_results, indent=2, ensure_ascii=False))
Enter fullscreen mode Exit fullscreen mode

Run your code using context manager:

with sync_playwright() as playwright:
    run(playwright)
Enter fullscreen mode Exit fullscreen mode

The whole part of the code looks like this:

def run(playwright):
    """Search Google Flights for a sample round trip and print the results.

    Passes the search values defined below to ``get_page()``, extracts the
    listings with ``scrape_google_flights()`` and prints them as indented JSON.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
    """
    from_place = 'Seattle'
    to_place = 'Las Vegas'
    departure_date = '14-6-2024'
    return_date = '20-6-2024'

    parser = get_page(playwright, from_place, to_place, departure_date, return_date)
    google_flights_results = scrape_google_flights(parser)

    # ensure_ascii=False keeps non-ASCII characters readable in the output
    print(json.dumps(google_flights_results, indent=2, ensure_ascii=False))


# Start the browser only when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    with sync_playwright() as playwright:
        run(playwright)
Enter fullscreen mode Exit fullscreen mode

Get page

The function takes a playwright object and parameters from_place, to_place, departure_date, return_date. Returns a parser.

Initialize playwright, connect to chromium, launch() a browser, open a new_page() and goto() the URL:

page = playwright.chromium.launch(headless=False).new_page()
page.goto('https://www.google.com/travel/flights?hl=en-US&curr=USD')
Enter fullscreen mode Exit fullscreen mode
Parameters Explanation
playwright.chromium is a connection to the Chromium browser instance.
launch() will launch the browser. The headless argument controls whether it runs in headless mode; the default is True, and the code above passes False so that the browser window is visible.
new_page() creates a new page in a new browser context.
page.goto() will make a request to provided website.

The next section of code is easier to show on the GIF:

get_page

In short, with the help of browser automation, we pass user parameters and generate search results:

# type "From"
from_place_field = page.query_selector_all('.e5F5td')[0]
from_place_field.click()
time.sleep(1)
from_place_field.type(from_place)
time.sleep(1)
page.keyboard.press('Enter')

# type "To"
to_place_field = page.query_selector_all('.e5F5td')[1]
to_place_field.click()
time.sleep(1)
to_place_field.type(to_place)
time.sleep(1)
page.keyboard.press('Enter')

# type "Departure date"
departure_date_field = page.query_selector_all('[aria-label="Departure"]')[0]
departure_date_field.click()
time.sleep(1)
departure_date_field.type(departure_date)
time.sleep(1)
page.query_selector('.WXaAwc .VfPpkd-LgbsSe').click()
time.sleep(1)

# type "Return date"
return_date_field = page.query_selector_all('[aria-label="Return"]')[0]
return_date_field.click()
time.sleep(1)
return_date_field.type(return_date)
time.sleep(1)
page.query_selector('.WXaAwc .VfPpkd-LgbsSe').click()
time.sleep(1)

# press "Explore"
page.query_selector('.MXvFbd .VfPpkd-LgbsSe').click()
time.sleep(2)

# press "More flights"
page.query_selector('.zISZ5c button').click()
time.sleep(2)
Enter fullscreen mode Exit fullscreen mode

After all the data has been loaded, you need to process the HTML with selectolax, because its Lexbor parser is incredibly fast: about 186% faster than bs4 with the lxml backend when parsing data with 3000 iterations 5 times. Please note that selectolax does not currently support XPath:

parser = LexborHTMLParser(page.content())
Enter fullscreen mode Exit fullscreen mode

After all the operations are done, the page is closed and the parser is returned:

page.close()

return parser
Enter fullscreen mode Exit fullscreen mode

The function looks like this:

def get_page(playwright, from_place, to_place, departure_date, return_date):
    """Fill in the Google Flights search form and return the parsed results page.

    Launches Chromium with ``headless=False``, types the given values into the
    "From", "To", "Departure" and "Return" fields, starts the search, expands
    the result list when a "More flights" button is present, and parses the
    final HTML.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
        from_place: Text typed into the "From" field.
        to_place: Text typed into the "To" field.
        departure_date: Text typed into the "Departure" field.
        return_date: Text typed into the "Return" field.

    Returns:
        A ``LexborHTMLParser`` built from the HTML of the results page.
    """
    # Keep the browser handle so the browser itself can be closed, not only the page.
    browser = playwright.chromium.launch(headless=False)

    try:
        page = browser.new_page()
        page.goto('https://www.google.com/travel/flights?hl=en-US&curr=USD')

        # NOTE(review): the fixed sleeps below presumably give the UI time to
        # react to each action; they are kept exactly as in the original flow.

        # type "From"
        from_place_field = page.query_selector_all('.e5F5td')[0]
        from_place_field.click()
        time.sleep(1)
        from_place_field.type(from_place)
        time.sleep(1)
        page.keyboard.press('Enter')

        # type "To"
        to_place_field = page.query_selector_all('.e5F5td')[1]
        to_place_field.click()
        time.sleep(1)
        to_place_field.type(to_place)
        time.sleep(1)
        page.keyboard.press('Enter')

        # type "Departure date"
        departure_date_field = page.query_selector_all('[aria-label="Departure"]')[0]
        departure_date_field.click()
        time.sleep(1)
        departure_date_field.type(departure_date)
        time.sleep(1)
        page.query_selector('.WXaAwc .VfPpkd-LgbsSe').click()
        time.sleep(1)

        # type "Return date"
        return_date_field = page.query_selector_all('[aria-label="Return"]')[0]
        return_date_field.click()
        time.sleep(1)
        return_date_field.type(return_date)
        time.sleep(1)
        page.query_selector('.WXaAwc .VfPpkd-LgbsSe').click()
        time.sleep(1)

        # press "Explore"
        page.query_selector('.MXvFbd .VfPpkd-LgbsSe').click()
        time.sleep(2)

        # press "More flights"; query_selector() returns None when nothing
        # matches, in which case there is nothing to expand
        more_flights_button = page.query_selector('.zISZ5c button')
        if more_flights_button is not None:
            more_flights_button.click()
            time.sleep(2)

        return LexborHTMLParser(page.content())
    finally:
        # Runs even when a step above raises, so no browser is left behind;
        # closing the browser also closes the page opened from it.
        browser.close()
Enter fullscreen mode Exit fullscreen mode

Scrape Google Flights Listings

The function takes a Lexbor parser and returns the extracted data.

The data dictionary is declared to which the extracted data will be added:

data = {}
Enter fullscreen mode Exit fullscreen mode

In order to extract all the data, you need to extract the category names and the elements of these categories separately. You need to use the css() method and pass the appropriate selectors there:

categories = parser.root.css('.zBTtmb')
category_results = parser.root.css('.Rk10dc')
Enter fullscreen mode Exit fullscreen mode

You then need to iterate over the resulting item lists using the zip() function to successfully categorize the flights. For each category, a category_data list is created, to which the flights associated with that category will be added:

for category, category_result in zip(categories, category_results):
    category_data = []
Enter fullscreen mode Exit fullscreen mode

We iterate each flight to extract all the necessary data:

for result in category_result.css('.yR1fYc'):
    # data extraction will be here
Enter fullscreen mode Exit fullscreen mode

Most of the data is easily retrieved:

company = result.css_first('.Ir0Voe .sSHqwe').text()
duration = result.css_first('.AdWm1c.gvkrdb').text()
stops = result.css_first('.EfT7Ae .ogfYpf').text()
emissions = result.css_first('.V1iAHe .AdWm1c').text()
emission_comparison = result.css_first('.N6PNV').text()
price = result.css_first('.U3gSDe .FpEdX span').text()
price_type = result.css_first('.U3gSDe .N872Rd').text()
Enter fullscreen mode Exit fullscreen mode
Code Explanation
css_first() to find the desired element.
text() to retrieve the text content.

The selector responsible for the date contains 2 elements: departure_date and arrival_date.

date = result.css('[jscontroller="cNtv4b"] span')
departure_date = date[0].text()
arrival_date = date[1].text()
Enter fullscreen mode Exit fullscreen mode

After extracting the main data, we form the flight_data dictionary:

flight_data = {
    'departure_date': departure_date,
    'arrival_date': arrival_date,
    'company': company,
    'duration': duration,
    'stops': stops,
    'emissions': emissions,
    'emission_comparison': emission_comparison,
    'price': price,
    'price_type': price_type
}
Enter fullscreen mode Exit fullscreen mode

After adding the main data, you need to handle the part that differs between results: a result contains either the departure_airport and the arrival_airport, or a service description instead.

airports-or-service

Depending on what data is present in this flight, we add them to the flight_data dictionary. Then add this dictionary to the category_data list:

airports = result.css_first('.Ak5kof .sSHqwe')
service = result.css_first('.hRBhge')

if service:
    flight_data['service'] = service.text()
else:
    flight_data['departure_airport'] = airports.css_first('span:nth-child(1) .eoY5cb').text()
    flight_data['arrival_airport'] = airports.css_first('span:nth-child(2) .eoY5cb').text()

category_data.append(flight_data)
Enter fullscreen mode Exit fullscreen mode

When all flights have been retrieved for a specific category, you need to add them to the data dictionary by key. The key is the name of the category:

data[category.text().lower().replace(' ', '_')] = category_data
Enter fullscreen mode Exit fullscreen mode

At the end of the function, the data dictionary is returned:

return data
Enter fullscreen mode Exit fullscreen mode

The function looks like this:

def _text_or_none(node):
    """Return the text of ``node``, or None when the selector matched nothing."""
    return node.text() if node is not None else None


def scrape_google_flights(parser):
    """Extract flight listings, grouped by category, from a parsed results page.

    Args:
        parser: Parsed page exposing ``root.css()``; result nodes must provide
            ``css()``, ``css_first()`` and ``text()``.

    Returns:
        A dict whose keys are the category headings (lower-cased, spaces
        replaced by underscores) and whose values are lists of flight dicts.
        Every flight dict has the keys ``departure_date``, ``arrival_date``,
        ``company``, ``duration``, ``stops``, ``emissions``,
        ``emission_comparison``, ``price`` and ``price_type``, plus either
        ``service`` or ``departure_airport``/``arrival_airport``. A value is
        None when its element is missing from the result.
    """
    data = {}

    categories = parser.root.css('.zBTtmb')
    category_results = parser.root.css('.Rk10dc')

    for category, category_result in zip(categories, category_results):
        category_data = []

        for result in category_result.css('.yR1fYc'):
            # the same selector yields both times: departure first, arrival second
            date = result.css('[jscontroller="cNtv4b"] span')

            # every field is looked up once and tolerates a missing element,
            # the same way price_type always did
            flight_data = {
                'departure_date': date[0].text() if len(date) > 0 else None,
                'arrival_date': date[1].text() if len(date) > 1 else None,
                'company': _text_or_none(result.css_first('.Ir0Voe .sSHqwe')),
                'duration': _text_or_none(result.css_first('.AdWm1c.gvkrdb')),
                'stops': _text_or_none(result.css_first('.EfT7Ae .ogfYpf')),
                'emissions': _text_or_none(result.css_first('.V1iAHe .AdWm1c')),
                'emission_comparison': _text_or_none(result.css_first('.N6PNV')),
                'price': _text_or_none(result.css_first('.U3gSDe .FpEdX span')),
                'price_type': _text_or_none(result.css_first('.U3gSDe .N872Rd'))
            }

            service = result.css_first('.hRBhge')

            if service is not None:
                flight_data['service'] = service.text()
            else:
                airports = result.css_first('.Ak5kof .sSHqwe')
                departure_airport = None
                arrival_airport = None

                if airports is not None:
                    # NOTE(review): the sample output in the "Output" section
                    # shows the same airport for both fields, so these two
                    # selectors may resolve to the same node - verify against
                    # the live markup.
                    departure_airport = _text_or_none(airports.css_first('span:nth-child(1) .eoY5cb'))
                    arrival_airport = _text_or_none(airports.css_first('span:nth-child(2) .eoY5cb'))

                flight_data['departure_airport'] = departure_airport
                flight_data['arrival_airport'] = arrival_airport

            category_data.append(flight_data)

        data[category.text().lower().replace(' ', '_')] = category_data

    return data
Enter fullscreen mode Exit fullscreen mode

Output

{
  "best_departing_flights": [
    {
      "departure_date": "10:03 PM",
      "arrival_date": "12:42 AM+1",
      "company": "Frontier",
      "duration": "2 hr 39 min",
      "stops": "Nonstop",
      "emissions": "99 kg CO2e",
      "emission_comparison": "-27% emissions",
      "price": "$242",
      "price_type": "round trip",
      "departure_airport": "Seattle–Tacoma International Airport",
      "arrival_airport": "Seattle–Tacoma International Airport"
    },
    {
      "departure_date": "8:05 AM",
      "arrival_date": "10:41 AM",
      "company": "Alaska",
      "duration": "2 hr 36 min",
      "stops": "Nonstop",
      "emissions": "130 kg CO2e",
      "emission_comparison": "Avg emissions",
      "price": "$412",
      "price_type": "round trip",
      "departure_airport": "Seattle–Tacoma International Airport",
      "arrival_airport": "Seattle–Tacoma International Airport"
    },
    {
      "departure_date": "9:10 AM",
      "arrival_date": "11:38 AM",
      "company": "Delta",
      "duration": "2 hr 28 min",
      "stops": "Nonstop",
      "emissions": "139 kg CO2e",
      "emission_comparison": "Avg emissions",
      "price": "$447",
      "price_type": "round trip",
      "departure_airport": "Seattle–Tacoma International Airport",
      "arrival_airport": "Seattle–Tacoma International Airport"
    }
  ],
  "other_departing_flights": [
    {
      "departure_date": "8:04 AM",
      "arrival_date": "6:35 AM+1",
      "company": "Frontier",
      "duration": "22 hr 31 min",
      "stops": "1 stop",
      "emissions": "186 kg CO2e",
      "emission_comparison": "+37% emissions",
      "price": "$329",
      "price_type": "round trip",
      "departure_airport": "Seattle–Tacoma International Airport",
      "arrival_airport": "Seattle–Tacoma International Airport"
    },
    {
      "departure_date": "3:34 PM",
      "arrival_date": "6:35 AM+1",
      "company": "Frontier",
      "duration": "15 hr 1 min",
      "stops": "1 stop",
      "emissions": "185 kg CO2e",
      "emission_comparison": "+36% emissions",
      "price": "$329",
      "price_type": "round trip",
      "departure_airport": "Seattle–Tacoma International Airport",
      "arrival_airport": "Seattle–Tacoma International Airport"
    },
    {
      "departure_date": "11:59 PM",
      "arrival_date": "9:43 AM+1",
      "company": "Separate tickets booked togetherThis trip includes tickets from multiple airlines. Missed connections may be protected by Gotogate. Learn moreSun Country Airlines",
      "duration": "9 hr 44 min",
      "stops": "1 stop",
      "emissions": "353 kg CO2e",
      "emission_comparison": "+160% emissions",
      "price": "$379",
      "price_type": "round trip",
      "departure_airport": "Seattle–Tacoma International Airport",
      "arrival_airport": "Seattle–Tacoma International Airport"
    },
    ... other results
  ]
}
Enter fullscreen mode Exit fullscreen mode

📌Note: Other departing trains are not displayed due to the fact that they are not available for this route.

Top comments (0)