How Is BeautifulSoup Used to Scrape a Movie Database?

  • Determine what data you want to extract from the website.
  • Examine the page.
  • Beautiful Soup is a great place to start scraping.
import time
import urllib.request
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download the list page and parse it. The response is used as a context
# manager so the connection is closed as soon as parsing is done.
# NOTE(review): the page address is an empty string here; urlopen('') cannot
# succeed until the real list URL is filled in.
with urlopen('') as html:
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed and emits a warning.
    bsObj = BeautifulSoup(html, 'html.parser')

# Indented rendering of the parsed document, handy for inspecting the markup.
bsObj.prettify()
# Locate the <ul id="list_page_1"> element, then collect every <li> inside it.
list_container = bsObj.find('ul', id="list_page_1")
movies = list_container.find_all('li')
len(movies)

# Print each entry together with its length so empty entries are easy to spot.
for movie in movies:
    print(movie, len(movie), "\n\n")
# Keep only the entries whose length is non-zero; len() is used as the
# predicate, so an entry with a length of 0 is dropped.
movies = list(filter(len, movies))
# Use the first entry in `movies` as a sample and inspect the attributes of
# its <img> tag.
movie_1 = movies[0]
movie_1.find('img').attrs
{'class': ['poster', 'lazyload', 'fade'], 'data-sizes': 'auto', 'data-src': '', 'data-srcset': ' 1x, 2x', 'alt': 'Nausicaä of the Valley of the Wind'}
'Nausicaä of the Valley of the Wind'
from IPython.display import Image

# The poster address is stored in the data-src attribute of the <img> tag.
poster = movie_1.img
image_url = poster['data-src']

# Render the poster inline in the notebook.
Image(url=image_url)
# Attributes of the entry's <a> tag.
a = movie_1.a.attrs

# Read the link target by name. Looping over every attribute value and
# keeping the last one only works while href happens to be the final (or
# only) attribute on the tag.
url = a['href']
print(url)
# Prefix prepended to the link taken from the entry to form the address of
# the movie's own page.
# NOTE(review): the prefix is an empty string here - presumably the site
# root belongs in it; confirm before running.
base_url = ""
full_url = base_url + url
# Read the text of the <span> inside <div class="number"> and convert it to
# an integer.
number_div = movie_1.find('div', class_='number')
int(number_div.span.get_text())
# Re-download and parse the list page; the response is closed once parsed.
# NOTE(review): the page address is an empty string here and must be filled
# in before this can run.
with urlopen('') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# Create five parallel lists holding each movie's url, name, rank, rating
# and poster image address. Index i in every list refers to the same movie.
urls = []
names = []
ranks = []
ratings = []
images = []

for movie in movies:
    # Take the link from the href attribute by name instead of keeping
    # whichever attribute value of the <a> tag happens to come last.
    url = movie.a['href']
    # NOTE(review): the prefix is an empty string - presumably the site root.
    urls.append("" + url)

    # The movie title is the alt text of the poster image.
    names.append(movie.img.attrs['alt'])

    # The rank is the text of the <span> inside <div class="number">.
    ranks.append(int(movie.find('div', {'class': 'number'}).span.text))

    # The second <span class="rating"> in the entry carries the numeric score.
    ratings.append(float(movie.find_all('span', {'class': 'rating'})[1].text))

    # The poster address is stored in the data-src attribute of the image.
    images.append(movie.find('img').attrs['data-src'])
# Fetch a single movie page first to work out where each field lives.
# NOTE(review): the page address is an empty string here and must be filled
# in before this can run.
with urlopen("") as url:
    # Parse inside the `with` so the body is read before the response closes;
    # the parser is named explicitly so results do not depend on which
    # parsers are installed.
    soup = BeautifulSoup(url, 'html.parser')

# Find summary
soup.find('div', {'class': 'overview'}).p.get_text()

# Find director
soup.find('li', {'class': 'profile'}).a.get_text()

# Find language, runtime, budget and revenue: the elements that follow
# <ul class="releases"> are read in that order.
inf = soup.find('ul', {'class': 'releases'}).find_next_siblings()
language = inf[0].text
runtime = inf[1].text
budget = inf[2].text
rev = inf[3].text

# Find genre
section = soup.find('section', {'class': 'genres right_column'})
[li.text for li in section.find_all('li')]

# Scrape every page: one entry per movie is appended to each list below, in
# the same order as `urls`.
summaries = []
languages = []
runtimes = []
budgets = []
revenues = []
genres = []
directors = []

for url in urls:
    # Close each response as soon as its page has been parsed so connections
    # do not pile up while the loop walks through every movie.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    summaries.append(soup.find('div', {'class': 'overview'}).p.get_text())

    inf = soup.find('ul', {'class': 'releases'}).find_next_siblings()
    languages.append(inf[0].text)
    runtimes.append(inf[1].text)
    budgets.append(inf[2].text)
    revenues.append(inf[3].text)

    directors.append(soup.find('li', {'class': 'profile'}).a.get_text())

    section = soup.find('section', {'class': 'genres right_column'})
    genres.append([li.text for li in section.find_all('li')])

import pandas as pd

# Combine the per-movie lists into one table, one row per movie. zip() stops
# at the shortest list, so rows are only produced for movies present in
# every list. `directors` is collected above but is not a column here.
ghibli = pd.DataFrame(
    list(zip(names, ranks, ratings, languages, runtimes,
             budgets, revenues, genres, summaries)),
    columns=['name', 'rank', 'rating', 'language', 'runtime',
             'budget', 'revenue', 'genre', 'summary'],
)
ghibli.head(10)



