Scrape IMDb movie ratings and details using Python

#python #web #scraping #beautifulsoup

This article is only for educational purposes

IMDb provides its own API for getting movie details, and you can use that instead; this article is only a web scraping example.

We will get the movie rating, the number of ratings, the name, and more. For this we use the BeautifulSoup and Requests packages.

import requests
from bs4 import BeautifulSoup

# IMDb title page used by the examples below
boys_url = "https://www.imdb.com/title/tt1190634/"

# download the page, then hand its HTML text to BeautifulSoup for parsing
r = requests.get(boys_url)
soup = BeautifulSoup(r.text, features='html.parser')

To get the title of the HTML page:

# page title
title = soup.find('title')
print(title.string)
# Create the result dict here: no `data` exists yet at this point, so
# assigning into it directly would raise NameError.
data = {"title": title.string}

soup.find looks for the first title tag; use title.string to get its value, which is returned as a string.

If you want to get all the div tags, use find_all:

# find_all returns every matching tag (here: all <div> elements), not just the first
soup.find_all('div')

To get a tag with a specific class value:

# find returns the first <div> whose class is 'titleBar'
soup.find("div",{'class':'titleBar'})

Here is the full code to get the details as a Python dict. The function fetches the HTML page with a GET request using requests, parses it into a BeautifulSoup object, and then extracts the data you want.

def _node_text(node):
    """Return the stripped text of a BeautifulSoup node.

    ``.string`` is ``None`` for a tag that does not have exactly one child,
    so fall back to ``get_text()`` instead of failing on ``None.strip()``.
    """
    text = node.string
    if text is None:
        text = node.get_text()
    return text.strip()


def getMovieDetails(url, timeout=10):
    """Scrape an IMDb title page and return its details as a dict.

    Args:
        url: Address of the title page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A dict with the keys ``title``, ``ratingValue``, ``ratingCount``,
        ``name``, ``subtext``, ``summary_text`` and ``credits``. ``credits``
        maps each credit heading to a list of ``{"link", "name"}`` dicts,
        one per link found under that heading.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AttributeError: If an expected element is missing from the page,
            because ``soup.find`` then returns ``None``.
    """
    data = {}
    r = requests.get(url=url, timeout=timeout)
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # page title
    data["title"] = soup.find('title').string

    # rating
    data["ratingValue"] = soup.find("span", {"itemprop": "ratingValue"}).string

    # number of ratings given
    data["ratingCount"] = soup.find("span", {"itemprop": "ratingCount"}).string

    # name: first child of the <h1>; only the non-breaking space is removed,
    # surrounding whitespace is left untouched
    heading = soup.find("div", {'class': 'titleBar'}).find("h1")
    data["name"] = heading.contents[0].replace(u'\xa0', u'')

    # additional details: concatenate the stripped text of every child node
    subtext = soup.find("div", {'class': 'subtext'})
    data["subtext"] = "".join(_node_text(child) for child in subtext.contents)

    # summary
    data["summary_text"] = _node_text(soup.find("div", {'class': 'summary_text'}))

    # credits: one entry per credit block, keyed by the text of its <h4>
    data["credits"] = {}
    for block in soup.find_all("div", {'class': 'credit_summary_item'}):
        heading = block.find("h4")
        data["credits"][heading.string] = [
            {"link": link["href"], "name": link.string}
            for link in block.find_all("a")
        ]
    return data

Movies:
tenet_url = "https://www.imdb.com/title/tt6723592/"
joker_url = "https://www.imdb.com/title/tt7286456/"
Series:
boys_url = "https://www.imdb.com/title/tt1190634/"
To get the movie details, call this function:

getMovieDetails(boys_url)

{'title': 'The Boys (TV Series 2019– ) - IMDb',
 'ratingValue': '8.7',
 'ratingCount': '173,133',
 'name': 'The Boys            ',
 'subtext': '18+|1h|Action,Comedy,Crime|TV Series (2019– )',
 'summary_text': 'A group of vigilantes sets out to take down corrupt superheroes who abuse their superpowers.',
 'credits': {'Creator:': [{'link': '/name/nm0471392/', 'name': 'Eric Kripke'}],
  'Stars:': [{'link': '/name/nm0881631/', 'name': 'Karl Urban'},
   {'link': '/name/nm4425051/', 'name': 'Jack Quaid'},
   {'link': '/name/nm1102278/', 'name': 'Antony Starr'},
   {'link': 'fullcredits/', 'name': 'See full cast & crew'}]}}

Additional content
To get all the cast and crew members with their roles:

import re
# Matches every run of characters that is not a letter, an apostrophe or a
# space. "Letter" here means a Unicode word character other than a decimal
# digit or underscore, so accented names such as "Clémence Poésy" survive.
_NON_NAME_CHARS = re.compile(r"(?:[^\w' ]|[\d_])+")


def getCrewData(url, timeout=10):
    """Scrape the cast table of an IMDb full-credits page.

    Args:
        url: Address of the full-credits page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A dict with ``crew``, a list of ``{"name", "character"}`` dicts, and
        ``title``, the text of the page's ``<title>`` tag.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AttributeError: If the page has no ``<title>`` or no
            ``<table class="cast_list">``, because ``soup.find`` then
            returns ``None``.
    """
    crew_data = {
        "crew": []
    }
    r = requests.get(url=url, timeout=timeout)
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')

    # page title
    crew_data["title"] = soup.find('title').string

    cast_list = soup.find("table", {"class": "cast_list"})
    for tr in cast_list.find_all('tr'):
        cells = tr.find_all('td')
        # Only rows with exactly four cells are used; cells 1 and 3 are
        # taken as name and character.
        # NOTE(review): presumably other rows are headers/separators - confirm.
        if len(cells) != 4:
            continue
        row = [cell.text for cell in cells]
        crew_data["crew"].append({
            "name": _NON_NAME_CHARS.sub('', row[1]).strip(),
            "character": _NON_NAME_CHARS.sub('', row[3]).strip(),
        })
    return crew_data

Series:
boys_url = "https://www.imdb.com/title/tt1190634/fullcredits/"
Movie:
tenet_url = "https://www.imdb.com/title/tt6723592/fullcredits/"

getCrewData(tenet_url)

Conclusion: Web scraping is never entirely stable: when a web page changes, code written against the old layout breaks. Not all websites allow you to scrape their content, and some render the page with JavaScript (built with Vue or React, for example); in that case use Selenium to get the rendered page. Use this only for educational purposes.

DEV Community

Scrape IMDB movie rating and details using python

This article is only for educational purposes

Top comments (0)