How to Scrape the Web to Get Data About the Top-Rated Movies on TV

Getting a Good Webpage to Scrape

A good webpage for this project:

  • has distinct HTML tags with a clear id or class
  • uses ids and classes in a consistent way (see the sketch after this list)
  • provides well-structured URLs
  • lists all relevant TV channels on a single page
  • has a separate page for every weekday
  • lists only films and no other programs such as news, live shows, or reportages, unless you can easily distinguish films from the other program types
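If you want to verify the first two criteria before committing to a page, a minimal sketch like the following works; the URL and the "program" class here are hypothetical placeholders for whatever page you are evaluating:

import requests
from lxml import html

# Hypothetical TV guide URL; replace with the page you want to evaluate
page = requests.get('https://www.example-tvguide.com/vandaag')
tree = html.fromstring(page.content)

# If every program sits in a div with the same class, the page is easy to scrape
programs = tree.xpath('//div[@class="program"]')
print("Found %d program blocks" % len(programs))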

Deciding Which Data to Store

  • Film Title
  • TV Channel
  • TMDB Rating
  • The Time When a Film Starts
  • The Date the Film Is on TV
  • Release Date
  • Plot
  • Link To The Details Page On TMDB
  • Genre

Creating a Scrapy Project

scrapy startproject topfilms
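The command generates a standard Scrapy project skeleton; depending on your Scrapy version it looks roughly like this, and the files edited in the following sections (items.py, pipelines.py, settings.py, spiders/) all live here:

topfilms/
    scrapy.cfg               # deploy configuration
    topfilms/
        __init__.py
        items.py             # item definitions (next section)
        pipelines.py         # item pipelines
        settings.py          # project settings
        spiders/
            __init__.py      # spider modules go in this package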

Define Scrapy Items

import scrapy


class TVGuideItem(scrapy.Item):
    title = scrapy.Field()
    channel = scrapy.Field()
    start_ts = scrapy.Field()
    film_date_long = scrapy.Field()
    film_date_short = scrapy.Field()
    genre = scrapy.Field()
    plot = scrapy.Field()
    rating = scrapy.Field()
    tmdb_link = scrapy.Field()
    release_date = scrapy.Field()
    nb_votes = scrapy.Field()
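A scrapy.Item behaves much like a Python dictionary restricted to its declared fields, so the spider can fill it key by key. A tiny usage sketch, with hypothetical values:

item = TVGuideItem()
item['title'] = 'The Matrix'   # hypothetical title
item['channel'] = 'Canvas'     # hypothetical channel
item['rating'] = '8.7'         # scraped as text, converted to float in the pipeline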

Process Items using Pipelines

import sqlite3 as lite


class StoreInDBPipeline(object):
    def __init__(self):
        self.setupDBCon()
        self.dropTopFilmsTable()
        self.createTopFilmsTable()

    def process_item(self, item, spider):
        self.storeInDb(item)
        return item

    def storeInDb(self, item):
        self.cur.execute(
            "INSERT INTO topfilms(title, channel, start_ts, film_date_long, "
            "film_date_short, rating, genre, plot, tmdb_link, release_date, nb_votes) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (item['title'],
             item['channel'],
             item['start_ts'],
             item['film_date_long'],
             item['film_date_short'],
             float(item['rating']),
             item['genre'],
             item['plot'],
             item['tmdb_link'],
             item['release_date'],
             item['nb_votes']))
        self.con.commit()

    def setupDBCon(self):
        self.con = lite.connect('topfilms.db')
        self.cur = self.con.cursor()

    def __del__(self):
        self.closeDB()

    def createTopFilmsTable(self):
        # rating is stored as REAL so the numeric threshold in the email
        # extension (WHERE rating >= 6.5) compares correctly
        self.cur.execute(
            "CREATE TABLE IF NOT EXISTS topfilms("
            "id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, "
            "title TEXT, channel TEXT, start_ts TEXT, film_date_long TEXT, "
            "film_date_short TEXT, rating REAL, genre TEXT, plot TEXT, "
            "tmdb_link TEXT, release_date TEXT, nb_votes TEXT)")

    def dropTopFilmsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS topfilms")

    def closeDB(self):
        self.con.close()

Create a Class for Storing Items in a Database

class StoreInDBPipeline(object):

Define a Constructor Method

def __init__(self):
    self.setupDBCon()
    self.dropTopFilmsTable()
    self.createTopFilmsTable()

SetupDBCon Method

def setupDBCon(self):
    self.con = lite.connect('topfilms.db')
    self.cur = self.con.cursor()

DropTopFilmsTable Method

def dropTopFilmsTable(self):
    self.cur.execute("DROP TABLE IF EXISTS topfilms")

CreateTopFilmsTable Method

def createTopFilmsTable(self):
    # rating is stored as REAL so the numeric threshold in the email
    # extension (WHERE rating >= 6.5) compares correctly
    self.cur.execute(
        "CREATE TABLE IF NOT EXISTS topfilms("
        "id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, "
        "title TEXT, channel TEXT, start_ts TEXT, film_date_long TEXT, "
        "film_date_short TEXT, rating REAL, genre TEXT, plot TEXT, "
        "tmdb_link TEXT, release_date TEXT, nb_votes TEXT)")

Process_item Method

def process_item(self, item, spider):
    self.storeInDb(item)
    return item

StoreInDb Method

def storeInDb(self, item):
    self.cur.execute(
        "INSERT INTO topfilms(title, channel, start_ts, film_date_long, "
        "film_date_short, rating, genre, plot, tmdb_link, release_date, nb_votes) "
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        (item['title'],
         item['channel'],
         item['start_ts'],
         item['film_date_long'],
         item['film_date_short'],
         float(item['rating']),
         item['genre'],
         item['plot'],
         item['tmdb_link'],
         item['release_date'],
         item['nb_votes']))
    self.con.commit()

Every Constructor Comes with a Destructor!

def __del__(self):
    self.closeDB()

CloseDB Method

def closeDB(self):
    self.con.close()

Enabling the Pipeline in settings.py

ITEM_PIPELINES = {
    'topfilms.pipelines.StoreInDBPipeline': 1,
}
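The number assigned to each pipeline determines the order in which item pipelines run; Scrapy expects a value in the 0-1000 range, with lower-valued pipelines running first. Since this project has only one pipeline, 1 is as good as any.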

Making a Spider in Scrapy

Importing the Required Packages

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from fuzzywuzzy import fuzz

from ..config import *
from topfilms.items import TVGuideItem

Instructing the Spider Where to Go

class TVGuideSpider(CrawlSpider):
    name = "tvguide"
    allowed_domains = [DOM_1, DOM_2]
    start_urls = [START_URL]

    # Extract the links from the day-by-day navigation;
    # we do not crawl the films for yesterday ('gisteren')
    rules = (
        Rule(LinkExtractor(allow=(), deny=(r'\/gisteren',),
                           restrict_xpaths=('//a[@class="button button--beta"]',)),
             callback="parse_by_day", follow=True),
    )
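The constants DOM_1, DOM_2, START_URL, and ALLOWED_CHANNELS (plus the email settings used later in the extension) come from a config.py module that the article does not show in full. A minimal sketch, with placeholder values you would replace with your own:

# config.py - placeholder values, replace with your own
DOM_1 = 'www.example-tvguide.com'        # TV guide domain (hypothetical)
DOM_2 = 'www.themoviedb.org'             # TMDB domain for the detail pages
START_URL = 'https://www.example-tvguide.com/vandaag'  # today's overview (hypothetical)
ALLOWED_CHANNELS = ['Canvas', 'Een']     # channels to keep (hypothetical)

# Email settings used by the SendEmail extension
FROMADDR = 'sender@example.com'
TOADDR = 'recipient@example.com'
UNAME = 'sender@example.com'
PW = 'your-app-password'
GMAIL = 'smtp.gmail.com:587'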

Parse the Followed URLs

def parse_by_day(self, response):
    film_date_long = response.xpath('//div[@class="grid__col__inner"]/p/text()').extract_first()
    film_date_long = film_date_long.rsplit(',', 1)[-1].strip()  # Remove the day name and whitespace

    # Build a short film date like YYYYMMDD to sort the results chronologically
    film_day_parts = film_date_long.split()
    months_list = ['januari', 'februari', 'maart', 'april', 'mei', 'juni',
                   'juli', 'augustus', 'september', 'oktober', 'november', 'december']
    year = str(film_day_parts[2])
    month = str(months_list.index(film_day_parts[1]) + 1).zfill(2)
    day = str(film_day_parts[0]).zfill(2)
    film_date_short = year + month + day

    for col_inner in response.xpath('//div[@class="grid__col__inner"]'):
        chnl = col_inner.xpath('.//div[@class="tv-guide__channel"]/h6/a/text()').extract_first()
        if chnl in ALLOWED_CHANNELS:
            for program in col_inner.xpath('.//div[@class="program"]'):
                item = TVGuideItem()
                item['channel'] = chnl
                item['title'] = program.xpath('.//div[@class="title"]/a/text()').extract_first()
                item['start_ts'] = program.xpath('.//div[@class="time"]/text()').extract_first()
                item['film_date_long'] = film_date_long
                item['film_date_short'] = film_date_short

                detail_link = program.xpath('.//div[@class="title"]/a/@href').extract_first()
                url_part = detail_link.rsplit('/', 1)[-1]

                # Extract information from The Movie Database www.themoviedb.org
                request = scrapy.Request("https://www.themoviedb.org/search?query=" + url_part,
                                         callback=self.parse_tmdb)
                request.meta['item'] = item  # Pass the item along to the detail page
                yield request
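To make the date handling concrete: a scraped Dutch date like "woensdag, 8 augustus 2018" is first reduced to "8 augustus 2018"; splitting gives ['8', 'augustus', '2018'], 'augustus' sits at index 7 in months_list, so film_date_short becomes "20180808", a format that sorts chronologically even as plain text.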

Scrape Additional Data on The Movie Database

The fuzzywuzzy package, imported at the top of the spider, provides the fuzzy string matching used to compare the TV guide title with the TMDB search results:

def parse_tmdb(self, response):
    item = response.meta['item']  # Use the passed item

    tmdb_titles = response.xpath('//a[@class="title result"]/text()').extract()
    if tmdb_titles:  # Check whether TMDB returned any results
        for tmdb_title in tmdb_titles:
            match_ratio = fuzz.ratio(item['title'], tmdb_title)
            if match_ratio > 90:
                item['genre'] = response.xpath('.//span[@class="genres"]/text()').extract_first()
                item['rating'] = response.xpath('//span[@class="vote_average"]/text()').extract_first()

                release_date = response.xpath('.//span[@class="release_date"]/text()').extract_first()
                release_date_parts = release_date.split('/')
                item['release_date'] = "/".join([release_date_parts[1].strip(),
                                                 release_date_parts[0].strip(),
                                                 release_date_parts[2].strip()])

                tmdb_link = "https://www.themoviedb.org" + \
                    response.xpath('//a[@class="title result"]/@href').extract_first()
                item['tmdb_link'] = tmdb_link

                # Extract more info from the detail page
                request = scrapy.Request(tmdb_link, callback=self.parse_tmdb_detail)
                request.meta['item'] = item  # Pass the item along to the detail page
                yield request
                break  # We only consider the first match
    else:
        return
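fuzz.ratio returns a similarity score between 0 and 100, so the threshold of 90 only accepts near-exact title matches. A small illustration, using the example from the fuzzywuzzy documentation plus a hypothetical mismatch:

from fuzzywuzzy import fuzz

print(fuzz.ratio("this is a test", "this is a test!"))  # 97: near-exact match
print(fuzz.ratio("The Matrix", "Matrix Reloaded"))      # well below the 90 threshold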

Scrape a Film Plot from the Details Page

def parse_tmdb_detail(self, response):
    item = response.meta['item']  # Use the passed item
    item['nb_votes'] = response.xpath('//span[@itemprop="ratingCount"]/text()').extract_first()
    item['plot'] = response.xpath('.//p[@id="overview"]/text()').extract_first()
    yield item
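Because parse_tmdb_detail yields the finished item rather than another request, Scrapy hands it to the StoreInDBPipeline enabled in settings.py, which writes the record to the SQLite database.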

Use of Extensions in Scrapy

Importing the Required Packages

import logging
import smtplib
import sqlite3 as lite

from scrapy import signals
from scrapy.exceptions import NotConfigured

from config import *

Create a SendEmail Class in Extensions

logger = logging.getLogger(__name__)


class SendEmail(object):
    def __init__(self):
        self.fromaddr = FROMADDR
        self.toaddr = TOADDR

Instantiating an Extension Object

MYEXT_ENABLED = True
EXTENSIONS = {
    'topfilms.extensions.SendEmail': 500,
    'scrapy.telnet.TelnetConsole': None,
}
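Setting a component to None, as done for the TelnetConsole, disables it, while the 500 plays the same ordering role as the values in ITEM_PIPELINES. The from_crawler class method below checks the MYEXT_ENABLED flag and connects the extension's methods to Scrapy's spider_opened and spider_closed signals: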
@classmethod
def from_crawler(cls, crawler):
    # First check whether the extension should be enabled;
    # raise NotConfigured otherwise
    if not crawler.settings.getbool('MYEXT_ENABLED'):
        raise NotConfigured

    # Instantiate the extension object
    ext = cls()

    # Connect the extension object to signals
    crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)

    # Return the extension object
    return ext

Define Actions in a spider_opened Event

def spider_opened(self, spider):
    logger.info("opened spider %s", spider.name)

Send an Email After the spider_closed Event

def spider_closed(self, spider):
    logger.info("closed spider %s", spider.name)

    # Get the films with a rating above the threshold
    topfilms_overview = ""
    con = lite.connect('topfilms.db')
    cur = con.execute(
        "SELECT title, channel, start_ts, film_date_long, plot, genre, "
        "release_date, rating, tmdb_link, nb_votes "
        "FROM topfilms "
        "WHERE rating >= 6.5 "
        "ORDER BY film_date_short, start_ts")
    data = cur.fetchall()

    if len(data) > 0:  # Check whether the query returned any records
        for row in data:
            title = row[0].encode('ascii', 'ignore').decode('ascii')  # Drop non-ASCII characters
            channel = row[1]
            start_ts = row[2]
            film_date_long = row[3]
            plot = row[4].encode('ascii', 'ignore').decode('ascii')
            genre = row[5]
            release_date = row[6].rstrip()
            rating = row[7]
            tmdb_link = row[8]
            nb_votes = row[9]

            topfilm = ' - '.join([title, channel, film_date_long, start_ts])
            topfilm = topfilm + "\r\n" + "Release date: " + release_date
            topfilm = topfilm + "\r\n" + "Genre: " + str(genre)
            topfilm = topfilm + "\r\n" + "TMDB rating: " + str(rating) + " from " + str(nb_votes) + " votes"
            topfilm = topfilm + "\r\n" + plot
            topfilm = topfilm + "\r\n" + "More info on: " + tmdb_link
            topfilms_overview = "\r\n\r\n".join([topfilms_overview, topfilm])

    con.close()

    if len(topfilms_overview) > 0:
        message = topfilms_overview
    else:
        message = "There are no top rated films for the coming week."

    msg = "\r\n".join([
        "From: " + self.fromaddr,
        "To: " + self.toaddr,
        "Subject: Top Films Overview",
        message
    ])

    username = UNAME
    password = PW
    server = smtplib.SMTP(GMAIL)
    server.ehlo()
    server.starttls()
    server.login(username, password)
    server.sendmail(self.fromaddr, self.toaddr, msg)
    server.quit()
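With the items, pipeline, spider, and extension in place, you can start the whole flow from the project root with Scrapy's standard crawl command (tvguide is the spider name defined above):

scrapy crawl tvguide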

Results of Sending Emails through Extensions

When the crawl finishes, the extension emails an overview of the coming week's top-rated films: for each film, the title, channel, air date and start time, release date, genre, TMDB rating with the number of votes, the plot, and a link to the film's TMDB page.
