How Is BeautifulSoup Used to Scrape a Movie Database?

  • Determine what data you want to extract from the website.
  • Examine the page's HTML to find the elements that contain that data.
  • Parse the page with Beautiful Soup, which is a great place to start scraping.
import time
import urllib.request
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download the list page. The `with` block closes the HTTP response as soon
# as the markup has been parsed instead of leaving the connection open.
with urlopen('https://www.themoviedb.org/list/4309') as html:
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree can
    # differ from one machine to the next.
    bsObj = BeautifulSoup(html, 'html.parser')

# Notebook display: the parsed document as indented markup.
bsObj.prettify()
# Collect every <li> found inside the <ul id="list_page_1"> element.
movies = bsObj.find('ul', attrs={'id': 'list_page_1'}).find_all('li')

# Notebook display: number of list items found.
len(movies)

# Print each item next to its len() to see which entries are empty.
for movie in movies:
    print(movie, len(movie), "\n\n")

# Drop the items whose len() is zero (presumably empty placeholder <li> tags).
movies = list(filter(len, movies))
# Use the first list item as a sample for exploring the markup.
movie_1 = movies[0]

# Notebook display: attribute dictionary of the item's <img> tag.
movie_1.img.attrs
{'class': ['poster', 'lazyload', 'fade'], 'data-sizes': 'auto', 'data-src': 'https://image.tmdb.org/t/p/w185_and_h278_bestv2/hnYowHwLq0iUWriAHtiiCWsI2dP.jpg', 'data-srcset': 'https://image.tmdb.org/t/p/w185_and_h278_bestv2/hnYowHwLq0iUWriAHtiiCWsI2dP.jpg 1x, https://image.tmdb.org/t/p/w370_and_h556_bestv2/hnYowHwLq0iUWriAHtiiCWsI2dP.jpg 2x', 'alt': 'Nausicaä of the Valley of the Wind'}
'Nausicaä of the Valley of the Wind'
from IPython.display import Image

# The poster address is stored in the <img> tag's 'data-src' attribute.
poster_tag = movie_1.find('img')
image_url = poster_tag.attrs['data-src']

# Notebook display: render the poster from its URL.
Image(url=image_url)
# Attribute dictionary of the item's first <a> tag.
a = movie_1.a.attrs

# Read the link target by name. Looping over every attribute value and
# keeping the last one only works while 'href' happens to be the final
# attribute, and fails outright if that last value is a multi-valued
# attribute such as 'class' (a list, which cannot be appended to a string).
url = a['href']
print(url)

# The href is site-relative, so prefix the site root to get a full address.
full_url = "https://www.themoviedb.org" + url
# Rank: integer text of the <span> inside the item's <div class="number">.
int(movie_1.find('div', attrs={'class': 'number'}).span.get_text())

# Rating: float text of the item's second <span class="rating">.
float(movie_1.find_all('span', attrs={'class': 'rating'})[1].get_text())
# Re-download and re-parse the list page, closing the response afterwards and
# naming the parser so the tree does not depend on what is installed.
# NOTE(review): the loop below iterates over `movies` from the earlier step,
# not over this fresh parse - confirm whether the re-fetch is still needed.
with urlopen('https://www.themoviedb.org/list/4309') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Create 5 parallel lists, one entry per movie: detail-page URL, name, rank,
# rating and poster image URL.
urls = []
names = []
ranks = []
ratings = []
images = []

for movie in movies:
    # Read the link target by name rather than taking whichever attribute
    # value happens to come last on the <a> tag.
    url = movie.a['href']
    urls.append("https://www.themoviedb.org" + url)
    names.append(movie.img.attrs['alt'])
    ranks.append(int(movie.find('div', {'class': 'number'}).span.text))
    # The second <span class="rating"> holds the numeric score.
    ratings.append(float(movie.find_all('span', {'class': 'rating'})[1].text))
    images.append(movie.find('img').attrs['data-src'])
# --- Explore a single detail page before scraping all of them ---------------
# The response gets its own name (the original reused `url` for it) and is
# closed by the `with` block once parsed; the parser is named explicitly so
# the tree does not depend on which parsers are installed.
with urlopen("https://www.themoviedb.org/movie/81") as response:
    soup = BeautifulSoup(response, 'html.parser')

# find summary
soup.find('div', {'class': 'overview'}).p.get_text()

# find director
soup.find('li', {'class': 'profile'}).a.get_text()

# Find language, runtime, budget and revenue: they are read, in that order,
# from the first four siblings that follow <ul class="releases">.
inf = soup.find('ul', {'class': 'releases'}).find_next_siblings()
language = inf[0].text
runtime = inf[1].text
budget = inf[2].text
rev = inf[3].text

# Find Genre
section = soup.find('section', {'class': 'genres right_column'})
[li.text for li in section.find_all('li')]

# --- Scrape every detail page -----------------------------------------------
summaries = []
languages = []
runtimes = []
budgets = []
revenues = []
genres = []
directors = []

for url in urls:
    # Close each response as soon as its page is parsed so connections do
    # not pile up over the course of the loop.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    summaries.append(soup.find('div', {'class': 'overview'}).p.get_text())

    inf = soup.find('ul', {'class': 'releases'}).find_next_siblings()
    languages.append(inf[0].text)
    runtimes.append(inf[1].text)
    budgets.append(inf[2].text)
    revenues.append(inf[3].text)

    directors.append(soup.find('li', {'class': 'profile'}).a.get_text())

    section = soup.find('section', {'class': 'genres right_column'})
    genres.append([li.text for li in section.find_all('li')])

import pandas as pd

# Combine the per-movie lists into one table, one row per movie.
# NOTE(review): `directors` is collected above but not included as a column -
# confirm whether that omission is intentional.
ghibli = pd.DataFrame(
    list(zip(names, ranks, ratings, languages, runtimes,
             budgets, revenues, genres, summaries)),
    columns=['name', 'rank', 'rating', 'language', 'runtime',
             'budget', 'revenue', 'genre', 'summary'],
)

# Notebook display: first ten rows.
ghibli.head(10)

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
3i Data Scraping

3i Data Scraping

44 Followers

3i Data Scraping is an Experienced Web Scraping Service Provider in the USA. We offer a Complete Range of Data Extraction Services from Websites and Online Outsourcing.