DEV Community

loading...
Cover image for Scrape Baidu Organic Search with Python

Scrape Baidu Organic Search with Python

Dimitry Zub
Python Web Scraping
・4 min read

Contents: intro, imports, organic result, answer box, related images, differences, links, outro.

Intro

This blog post is a collection of examples showing how to scrape various Baidu search results using Python, as well as an alternative solution, SerpApi; you can combine these approaches to suit your particular needs.

Imports

from bs4 import BeautifulSoup
import requests, lxml, json
from serpapi import BaiduSearch # only for SerpApi solution
import os # only used with SerpApi to create environment for API_KEY
Enter fullscreen mode Exit fullscreen mode

Organic Results

from bs4 import BeautifulSoup
import requests, lxml, json

# Mobile (Android / Edge) User-Agent sent with every request so Baidu
# returns a regular page instead of blocking the default requests UA.
# NOTE(review): the CSS selectors below presumably match the layout this
# UA receives — confirm if the UA string is changed.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.105 Mobile Safari/537.36 EdgA/46.1.2.5140"
}


def get_organic_results():
    """Scrape Baidu organic search results for 'minecraft' and print them as JSON.

    Uses the module-level ``headers`` (mobile User-Agent) for the request,
    parses every ``.result.c-container.new-pmd`` container, and prints the
    title, link, displayed link, snippet, and optional sitelink data.
    """
    html = requests.get('https://www.baidu.com/s?&tn=baidu&wd=minecraft', headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')

    baidu_data = []

    for result in soup.select('.result.c-container.new-pmd'):
        title = result.select_one('.t').text
        link = result.select_one('.t').a['href']
        displayed_link = result.select_one('.c-showurl').text
        snippet = result.select_one('.c-abstract').text

        # Sitelinks are optional: select_one() returns None when the element
        # is absent, so an explicit None check replaces the bare excepts.
        sitelink = result.select_one('.op-se-listen-recommend')
        sitelink_title = sitelink.text if sitelink else None
        # BUG FIX: the original looked up the misspelled attribute 'herf',
        # which always raised and was silently turned into None by a bare
        # except — the sitelink URL was never extracted.
        sitelink_link = sitelink.get('href') if sitelink else None

        baidu_data.append({
            'title': title,
            'link': link,
            'displayed_link': displayed_link,
            'snippet': snippet,
            'sitelinks': {'title': sitelink_title, 'link': sitelink_link},
        })

    print(json.dumps(baidu_data, indent=2, ensure_ascii=False))

# Part of the output:
'''
[
  {
    "title": "minecraft website - 官方网站 | Minecraft",
    "link": "http://www.baidu.com/link?url=_XTFGPU6ibzEJnDEdC4y2_WnTCHh-xaHkiR06lAOA6a",
    "displayed_link": "minecraft.net/",
    "snippet": "2021年3月3日 我的世界是一款堆方块、不断冒险的游戏。在此购买,或浏览网站了解最新消息和社区的精彩创意!",
    "sitelinks": {
      "title": null,
      "link": null
    }
  }
]
'''
Enter fullscreen mode Exit fullscreen mode

Using Baidu Organic Search Results API

import os, json
from serpapi import BaiduSearch

def get_organic_results():
    """Fetch Baidu organic results for 'minecraft' via SerpApi and print JSON.

    Reads the SerpApi key from the ``API_KEY`` environment variable.
    Optional fields are read with ``dict.get`` (None when missing) instead
    of the original try/except blocks around every access.
    """
    params = {
        "engine": "baidu",
        "q": "minecraft",
        "api_key": os.getenv("API_KEY"),
    }

    search = BaiduSearch(params)
    results = search.get_dict()

    baidu_data = []

    # .get(..., []) avoids a KeyError when the response has no organic results.
    for result in results.get('organic_results', []):
        # Nested optional structure: missing 'rich_snippet' or 'sitelinks'
        # simply yields None values instead of raising.
        sitelinks = result.get('rich_snippet', {}).get('sitelinks', {})

        baidu_data.append({
            'title': result['title'],
            'link': result['link'],
            'displayed_link': result.get('displayed_link'),
            'snippet': result.get('snippet'),
            'sitelinks': [{
                'title': sitelinks.get('title'),
                'link': sitelinks.get('link'),
            }],
        })

    print(json.dumps(baidu_data, indent=2, ensure_ascii=False))

# Part of the output:
'''
[
  {
    "title": "minecraft website - 官方网站 | Minecraft",
    "link": "http://www.baidu.com/link?url=OD7rfRPzLty76yZJ9dimCAV2VS-QyXURXbLmjXH3wq3",
    "displayed_link": "minecraft.net/",
    "snippet": "我的世界是一款堆方块、不断冒险的游戏。在此购买,或浏览网站了解最新消息和社区的精彩创意!",
    "sitelinks": [
      {
        "title": null,
        "link": null
      }
    ]
  }
]
'''
Enter fullscreen mode Exit fullscreen mode

Answer Box

from bs4 import BeautifulSoup
import requests, lxml, re, json

# Mobile (Android / Edge) User-Agent sent with every request so Baidu
# returns a regular page instead of blocking the default requests UA.
# NOTE(review): the CSS selectors below presumably match the layout this
# UA receives — confirm if the UA string is changed.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.105 Mobile Safari/537.36 EdgA/46.1.2.5140"
}


def get_answerbox_result():
    """Scrape Baidu's dictionary answer box (English word card) and print JSON.

    Extracts the English word, British/American phonetics, the 英/美 marker
    characters, pronunciation audio links, and the definition text, then
    prints the collected data. Prints a fallback message when the expected
    answer-box markup is not present.
    """
    html = requests.get('https://www.baidu.com/s?&tn=baidu&wd=jet li', headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')

    # Narrowed from a bare except: only the exceptions that a missing /
    # differently-shaped answer box actually produces (select_one() -> None
    # -> AttributeError, find() -> None -> TypeError, missing attr -> KeyError)
    # are treated as "no answer box"; anything else surfaces as a real error.
    try:
        answer_box = []

        for result in soup.find_all('div', class_='c-border'):
            english_word = result.select_one('.op_dict3_marginRight').text
            # British pronunciation column
            british_phonetic = result.select_one('.c-color-t+ td .op_dict3_gap_small').text
            british_chinese_character = result.select_one('.c-color-t+ td .op_dict3_font14').text
            british_audio_link = result.find('a', class_='op_dict3_how_read c-gap-right-middle')['url']
            # American pronunciation column
            american_phonetic = result.select_one('.c-color-t~ td+ td .op_dict3_gap_small').text
            american_chinese_character = result.select_one('.c-color-t~ td+ td .op_dict3_font14').text
            american_audio_link = result.find('a', class_='op_dict3_how_read c-gap-right-middle')['url']

            raw_definition = result.select_one('.c-gap-bottom-xsmall+ .op_dict3_english_result_table .op_dict_text2').text
            # Collapse ALL whitespace with a regex: the raw text is littered
            # with newlines/spaces that replace('\n', '') or strip() can't
            # fully remove.
            definition = re.sub(r'\s+', '', raw_definition)

            answer_box.append({
                'english_word': english_word,
                'british': {'phonetic': british_phonetic, 'chinese_character': british_chinese_character, 'audio_link': british_audio_link},
                'american': {'phonetic': american_phonetic, 'chinese_character': american_chinese_character, 'audio_link': american_audio_link},
                'defenition': definition,
            })

        print(json.dumps(answer_box, indent=2, ensure_ascii=False))

    except (AttributeError, TypeError, KeyError):
        print('No answer box found')

# Output:
'''
[
  {
    "english_word": "coffee",
    "british": {
      "phonetic": "[ˈkɒfi]",
      "chinese_character": "英",
      "audio_link": "https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=uk&text=coffee&spd=2&source=alading"
    },
    "american": {
      "phonetic": "[ˈkɔːfi]",
      "chinese_character": "美",
      "audio_link": "https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=uk&text=coffee&spd=2&source=alading"
    },
    "defenition": "(烘烤过的)咖啡豆;咖啡粉;咖啡(热饮料);一杯咖啡;"
  }
]
'''
Enter fullscreen mode Exit fullscreen mode

Using SerpApi Answer box

import os, json
from serpapi import BaiduSearch


def get_answerbox_result():
    """Fetch Baidu's dictionary answer box for 'coffee' via SerpApi and print it.

    Reads the SerpApi key from the ``API_KEY`` environment variable and
    prints the title, link, displayed link, English word, British/American
    pronunciation dicts, and the first definition for each answer-box entry.
    """
    params = {
        "engine": "baidu",
        "q": "coffee",
        "api_key": os.getenv("API_KEY"),
    }

    search = BaiduSearch(params)
    results = search.get_dict()

    # 'answer_box' is absent for queries without one; defaulting to an empty
    # list avoids the KeyError the original direct indexing would raise.
    for result in results.get('answer_box', []):
        title = result['title']
        link = result['link']
        displayed_link = result['displayed_link']
        english_word = result['english_word']
        british = result['british']
        american = result['american']
        definitions = result['definitions'][0]  # API returns a list of strings

        print(f'{title}\n{link}\n{displayed_link}\n{english_word}\n{british}\n{american}\n{definitions}')

# Output:
'''
coffee - 百度翻译
http://www.baidu.com/link?url=JA5gottCkKOdztdz_enXoECH2LfUZwlDRs-ll_E7fa6TXpjY6hQzf1GzPU7gTxHkOTOTFpSm6g_6OlvRNqjjP_
fanyi.baidu.com
coffee
{'phonetic': '[ˈkɒfi]', 'chinese_character': '英', 'audio_link': 'https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=uk&text=coffee&spd=2&source=alading'}
{'phonetic': '[ˈkɔːfi]', 'chinese_character': '美', 'audio_link': 'https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=en&text=coffee&spd=2&source=alading'}
['n. (烘烤过的)咖啡豆; 咖啡粉; 咖啡(热饮料); 一杯咖啡;']
'''
Enter fullscreen mode Exit fullscreen mode

Related Images

from bs4 import BeautifulSoup
import requests, lxml, re, json

# Mobile (Android / Edge) User-Agent sent with every request so Baidu
# returns a regular page instead of blocking the default requests UA.
# NOTE(review): the CSS selectors below presumably match the layout this
# UA receives — confirm if the UA string is changed.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.105 Mobile Safari/537.36 EdgA/46.1.2.5140"
}


def get_related_images_result():
    """Scrape the related-images carousel from a Baidu SERP.

    Prints each image's target link followed by its thumbnail source URL.
    """
    response = requests.get('https://www.baidu.com/s?&tn=baidu&wd=jet li', headers=headers)
    page = BeautifulSoup(response.text, 'lxml')

    for image_block in page.find_all('div', class_='op-img-address-divide-high'):
        anchor = image_block.a
        print(f"{anchor['href']}\n{anchor.img['src']}\n")

# part of output:
'''
http://www.baidu.com/link?url=eSWgjIjvJxs9ihAwqPFMk0w0oeXDbwJ7Hi4mYNJzirGQ1ssl8BuLkI7GhtPPou-J2tYlh7CaMQhGC8uAStmiI7Kx2Ln8mNBobjTQ8J8elSeHIHbKy2UKJPMNB8Jv8C6JxzjRlSeOVeGhmGqg0HvT69706LMw5k7KX5V4aKLgkfTrDjYLwG1b9wRG_n4G752-MLNP_u0rJLwS0PGKAdIctA-oStoNf8efPJZmkExIpA6GZQ1-T0YyA445E9uAtWldweZwOFrZ5H-KzkT5xKW3e33kFyGrQV5Rb_li6YZ6VZ8M4K3ESwO6tzEex_eZxq_xrhRGddDw1LHTn1NmXqvsrkCEpPze5oAtsXNEaSMnSENi3q_qpTucgaWN8eDYk4ciQr42JVuv1cgrHKSf4_0dNwBhiAQB8uj6UIJFDZ-tFAIX1O2ZWQGhoBgpVm7DjVIVoVVraQx9PwZVTq80P3DhhH91U6QkSh4y1LmZJxHZVnRQ-_pZUJKircxw9ofSrgwSWNxkYo6NXwwn9ys9ggz12PHJo5IvjJRGFIlaEm1ZZHfuSfEusdI71L9RQWuSrWpxJiMqS-oqe_pSNgYxPD1PK_
https://dss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2262600612,448000708&fm=26&gp=0.jpg
'''
Enter fullscreen mode Exit fullscreen mode

Using SerpApi Related Images

import os, json
from serpapi import BaiduSearch


def get_related_images_result():
    """Fetch related images attached to Baidu organic results via SerpApi.

    Reads the SerpApi key from the ``API_KEY`` environment variable and
    prints the 'related_images' list of each organic result, or None for
    results without images.
    """
    params = {
        "engine": "baidu",
        "q": "jet li",
        "api_key": os.getenv("API_KEY"),
    }

    search = BaiduSearch(params)
    results = search.get_dict()

    # .get(..., []) guards against a response with no organic results, and
    # .get('related_images') replaces the original bare try/except: a result
    # without images simply yields None.
    for result in results.get('organic_results', []):
        print(result.get('related_images'))

# part of the output:
'''
[{'link': 'http://www.baidu.com/link?url=YQnuO4njMj88UErWJBkGuS4aGdNiv9ZVtySw5fqiVpRTwmgJFEm_ZhCw9Zbc7U1C3Red20zd6N-FzwpURm5jDcnUsp34rhTHApNvnHuB3DlhwIu7-4BwuzlITjhSrXr0DgMBZGNt3UhgGNVTrybeZ6IPGD8Ej_oqSASrusItTQiAVlW-khcZ0A8Q1oWo6Dea_9u1gigFS30GAwBJGz4RdrnFmcyAo7AshuflPdptpcLWqx5TTYF0WjjQVVULBSRmETaEfEGIuO_YMoOKqGoc9d9d9o9QUmRClayPSf5xTppjPGYQGZmUDJ-93grTkqry63e4nXW460Lf-8ctZfnV36UTpWm-hmhXHw7pjATVT88Rmvbxo_hVLyH0dUNdapqsqTdl6YBYFA4k1JjmR5ibhDHd5tH1QuBc5XJVoG1HL-dxNjU_a3NecDeejZstG9zAr59ESZli63E8tgX1THSJ0xeY9G9VOZI-dx79kSg0pUyzctaux8jHWlh48D7qcg5sJCDh_V33kOnhTp9pbJqI3DR4r05Ma_WowxYUV87-pkMxmSnPXtK8Av6lCQgvz7tAFSmzLoPWmz5Fd_cSJ_yB7a', 'image': 'https://dss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2262600612,448000708&fm=26&gp=0.jpg'}]
'''
Enter fullscreen mode Exit fullscreen mode

Differences between API and non-API solution

  • fast solution and straightforward exploration.
  • don't have to figure out how to grab certain elements.
  • don't have to maintain the parser if things are changed in the HTML code.

Links

Code in the online IDE.
Documentation: Baidu Organic Search Results API.

Outro

If you want to see how to scrape something specific, or would like to see some projects made with SerpApi, please write me a message.

Yours, D.

Discussion (0)