I'm attempting to make a full stack site with a scraper that runs on a schedule, storing an object in a MongoDB collection periodically. I'm attempting to use Puppeteer and cheerio with node-cron for the scheduled scrape/store. The code works great, but it only works once. It seems Puppeteer can't close itself once its tasks are complete, but I could be very wrong as I'm new at this.
CRON Schedule
let cron = require("node-cron");
let shell = require("shelljs");
cron.schedule("* * * * * *", () => {
console.log("Scheduler executing...");
if(shell.exec("node scraper.js").code !== 0) {
console.log("Scheduler halted!");
}
});
Web Scraper
var { MongoClient } = require('mongodb');
var uri = "MONGODB_URI";
const client = new MongoClient(uri, {useNewUrlParser: true, useUnifiedTopology: true});
const puppeteer = require('puppeteer');
const $ = require('cheerio');
const url = 'SCRAPE_TARGET_URL';
let myObj = {
date: Date(),
one: '',
one-secondary: '',
two: '',
two-secondary: '',
three: '',
three-secondary: ''
}
puppeteer
.launch()
.then(function(browser) {
return browser.newPage();
})
.then(function(page) {
return page.goto(url).then(function() {
return page.content();
});
})
.then(function(html) {
$('#target-one', html).each(function() {
console.log($(this).text());
return myObj.one = $(this).text();
});
$('#target-one-secondary', html).each(function() {
console.log($(this).text());
return myObj.oneSecondary = $(this).text();
});
$('#target-two', html).each(function() {
console.log($(this).text());
return myObj.two = $(this).text();
});
$('#target-two-secondary', html).each(function() {
console.log($(this).text());
return myObj.twoSecondary = $(this).text();
});
$('#target-three', html).each(function() {
console.log($(this).text());
return myObj.three = $(this).text();
});
$('#target-three-secondary', html).each(function() {
console.log($(this).text());
return myObj.threeSecondary = $(this).text();
});
return console.log(myObj);
})
.then(async function() {
await client.connect();
const database = client.db("MY_DB");
const collection = database.collection("MY_COLLECTION");
const doc = myObj;
const result = await collection.insertOne(doc);
console.log(
`${result.insertedCount} documents were inserted with the _id: ${result.insertedId}`,
);
await client.close();
}
);
question from:
https://stackoverflow.com/questions/65835323/puppeteer-node-cron-hangs-after-first-execution 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…