Make a Web Scraper with AWS Lambda and the Serverless Framework

  • Node.js and modern JavaScript
  • NPM
  • The Document Object Model
  • Basic Linux command line
  • Basic donkey care

Why Use a Scraper?

Step 1: Serverless Setup

$ serverless create --template aws-nodejs --path donkeyjob
$ cd donkeyjob
# serverless.yml
# Declares the "donkeyjob" service with a single function, getdonkeyjobs,
# whose code is the getdonkeyjobs export of handler.js.
service: donkeyjob

provider:
  name: aws
  runtime: nodejs6.10

functions:
  getdonkeyjobs:
    handler: handler.getdonkeyjobs
/**
 * Placeholder Lambda handler: immediately reports success with a fixed
 * greeting so the deployment wiring can be checked.
 *
 * @param {Object} event - Invocation payload (not used).
 * @param {Object} context - Invocation context (not used).
 * @param {Function} callback - Node-style callback, invoked as
 *   callback(null, 'Hello world').
 */
function getdonkeyjobs (event, context, callback) {
  const greeting = 'Hello world';
  callback(null, greeting);
}

module.exports.getdonkeyjobs = getdonkeyjobs;
$ serverless invoke local --function getdonkeyjobs

Step 2: Scraping The Data

[ {job: 'Marketing Campaigns Officer', closing: 'Fri Jul 21 2017 00:00:00 GMT+0100', location: 'Leeds, UK'}, {job: 'Registered Veterinary Nurse', closing: 'Sat Jul 22 2017 00:00:00 GMT+0100', location: 'Manchester, UK'}, {job: 'Building Services Manager', closing: 'Fri Jul 21 2017 00:00:00 GMT+0100', location: 'London, UK'} ];
// ---------------------------------------------------------------------------
// handler.js
// ---------------------------------------------------------------------------
const request = require('axios');
const helpers = require('./helpers');

/**
 * Lambda handler: downloads the vacancies page, extracts the job listings
 * from its HTML and hands them to the callback as `{jobs}`.
 *
 * @param {Object} event - Invocation payload (not used).
 * @param {Object} context - Invocation context (not used).
 * @param {Function} callback - Node-style callback; receives
 *   (null, {jobs}) on success, or the error as its first argument when the
 *   request or the extraction fails.
 */
module.exports.getdonkeyjobs = (event, context, callback) => {
  request('https://www.thedonkeysanctuary.org.uk/vacancies')
    .then(({data}) => {
      const jobs = helpers.extractListingsFromHTML(data);
      callback(null, {jobs});
    })
    .catch(callback);
};

// ---------------------------------------------------------------------------
// helpers.js
// ---------------------------------------------------------------------------
const cheerio = require('cheerio');
const moment = require('moment');

/**
 * Turns the vacancies page HTML into a list of job records.
 *
 * @param {string} html - Markup of the vacancies page.
 * @returns {Array<{closing: *, job: string, location: string}>} One entry per
 *   row matched by `.view-Vacancies tbody tr`, in document order. `closing`
 *   is whatever `moment(...).toISOString()` yields for the deadline cell.
 */
function extractListingsFromHTML (html) {
  const $ = cheerio.load(html);
  const vacancyRows = $('.view-Vacancies tbody tr');
  const vacancies = [];

  vacancyRows.each((i, el) => {
    // Extract information from each row of the jobs table
    const row = $(el);
    const deadlineText = row.children('.views-field-field-vacancy-deadline').first().text().trim();
    const job = row.children('.views-field-title').first().text().trim();
    const location = row.children('.views-field-name').text().trim();

    // Keep only the text before the first '-' (presumably the cell reads
    // "DD/MM/YYYY - <time>"; verify against the live page). Splitting instead
    // of slicing at indexOf('-') - 1 avoids chopping characters off the date
    // when the cell has no '-' or no space before it.
    const datePart = deadlineText.split('-')[0].trim();
    const closing = moment(datePart, 'DD/MM/YYYY').toISOString();

    vacancies.push({closing, job, location});
  });

  return vacancies;
}

module.exports = { extractListingsFromHTML };
$ serverless invoke local --function getdonkeyjobs

Step 3: Setup DynamoDB

# serverless.yml
# Service definition: one function (getdonkeyjobs), one DynamoDB table
# (donkeyjobs) and an IAM policy that lets the function's role use the table.
service: donkeyjob

provider:
  name: aws
  runtime: nodejs6.10

functions:
  getdonkeyjobs:
    handler: handler.getdonkeyjobs

resources:
  Resources:
    # Table keyed by a single string hash key, listingId.
    donkeyjobs:
      Type: AWS::DynamoDB::Table
      Properties:
        TableName: donkeyjobs
        AttributeDefinitions:
          - AttributeName: listingId
            AttributeType: S
        KeySchema:
          - AttributeName: listingId
            KeyType: HASH
        ProvisionedThroughput:
          ReadCapacityUnits: 1
          WriteCapacityUnits: 1

    # A policy is a resource that states one or more permissions. It lists actions, resources and effects.
    DynamoDBIamPolicy:
      Type: AWS::IAM::Policy
      DependsOn: donkeyjobs
      Properties:
        PolicyName: lambda-dynamodb
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
                - dynamodb:DescribeTable
                - dynamodb:Query
                - dynamodb:Scan
                - dynamodb:GetItem
                - dynamodb:PutItem
                - dynamodb:UpdateItem
                - dynamodb:DeleteItem
              Resource: arn:aws:dynamodb:*:*:table/donkeyjobs
        Roles:
          - Ref: IamRoleLambdaExecution

Step 4: Interact with DynamoDB

{ jobs: [ {job: 'Donkey Feeder', closing: 'Fri Jul 21 2017 00:00:00 GMT+0100', location: 'Leeds, UK'}, {job: 'Chef', closing: 'Fri Jul 21 2017 00:00:00 GMT+0100', location: 'Sheffield, UK'} ], listingId: 'Fri Jul 21 2017 14:25:35 GMT+0100 (BST)' }
const request = require('axios');
const AWS = require('aws-sdk');
const dynamo = new AWS.DynamoDB.DocumentClient();
const { differenceWith, isEqual } = require('lodash');
const { extractListingsFromHTML } = require('./helpers');

/**
 * Lambda handler: scrapes the current vacancies, compares them with the
 * listing stored in the `donkeyjobs` table, replaces the stored listing with
 * the current one and reports the jobs that were not in the stored listing.
 *
 * @param {Object} event - Invocation payload (not used).
 * @param {Object} context - Invocation context (not used).
 * @param {Function} callback - Node-style callback; receives
 *   (null, {jobs: newJobs}) on success, or the error as its first argument
 *   when any step of the chain rejects.
 */
module.exports.getdonkeyjobs = (event, context, callback) => {
  // Shared between the promise steps below.
  let newJobs;
  let allJobs;

  // Step 1: parse the scraped page, then read what is currently stored.
  const loadStoredListing = (page) => {
    allJobs = extractListingsFromHTML(page.data);
    // Retrieve yesterday's jobs
    return dynamo.scan({ TableName: 'donkeyjobs' }).promise();
  };

  // Step 2: work out the new jobs and remove the stored listing, if any.
  const dropStoredListing = (scanResult) => {
    const stored = scanResult.Items[0];

    // Figure out which jobs are new
    const storedJobs = stored ? stored.jobs : [];
    newJobs = differenceWith(allJobs, storedJobs, isEqual);

    // Get the ID of yesterday's jobs which can now be deleted
    const staleId = stored ? stored.listingId : null;
    if (!staleId) {
      return undefined;
    }

    // Delete old jobs
    return dynamo.delete({
      TableName: 'donkeyjobs',
      Key: { listingId: staleId }
    }).promise();
  };

  // Step 3: save the list of today's jobs under a timestamp-based ID.
  const storeCurrentListing = () => {
    const item = {
      listingId: new Date().toString(),
      jobs: allJobs
    };
    return dynamo.put({ TableName: 'donkeyjobs', Item: item }).promise();
  };

  // Step 4: hand the new jobs back to the caller.
  const reportNewJobs = () => {
    callback(null, { jobs: newJobs });
  };

  request('https://www.thedonkeysanctuary.org.uk/vacancies')
    .then(loadStoredListing)
    .then(dropStoredListing)
    .then(storeCurrentListing)
    .then(reportNewJobs)
    .catch(callback);
};
$ serverless invoke local --function getdonkeyjobs

Step 5: Sending a Text Using Nexmo

.then(() => { if (newJobs.length) { var nexmo = new Nexmo({ apiKey: NEXMO_API_KEY, apiSecret: NEXMO_API_SECRET }); nexmo.message.sendSms('Donkey Jobs Finder', MY_PHONE_NUMBER, 'Hello, we found a new donkey job!'); } callback(null, { jobs: newJobs }); })
/**
 * Builds the body of the notification text from a list of jobs.
 *
 * @param {Array<{job: *, location: *, closing: *}>} list - Jobs to describe.
 * @returns {string} 'We found:\n\n' followed by one
 *   "<job> in <location> closing on <date>" entry per job, each entry ending
 *   in a blank line. The date is `moment(closing).format('LL')`.
 */
function formatJobs (list) {
  const header = 'We found:\n\n';
  const entries = list.map((job) =>
    `${job.job} in ${job.location} closing on ${moment(job.closing).format('LL')}\n\n`
  );
  return `${header}${entries.join('')}`;
}

module.exports = { extractListingsFromHTML, formatJobs };

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
3i Data Scraping

3i Data Scraping

44 Followers

3i Data Scraping is an experienced web scraping service provider in the USA. We offer a complete range of website data extraction and online outsourcing services.