I'm trying to get into web scraping more deeply, and I've run into an issue with a site that seems to detect my scraping attempts, even though I use exactly the same headers as a real browser.
I tried it with three different libraries and failed: request, node-fetch and axios.
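The request attempt looked roughly like this (a sketch from memory; the exact options I passed are assumptions, but the result was the same blocked response):

const request = require('request');

function search() {
    request({
        url: "https://www.computeruniverse.net/de/page/nvidia-geforce-rtx-3060-ti",
        headers: {
            "accept-language": "de-DE,de;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
        },
        gzip: true
    }, function (error, response, body) {
        if (error) {
            console.log(error);
            return;
        }
        // Same symptom as with the other libraries: the response indicates the request was blocked
        console.log(response.statusCode);
        console.log(body);
    });
}

search();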
node-fetch code:
const fetch = require('node-fetch');

function search() {
    // Randomized refresh interval between 45 and 55 seconds (not used further below yet)
    var bench = new Date().getTime();
    var minRefresh = 45000;
    var maxRefresh = 55000;
    var refresh = minRefresh + Math.round(Math.random() * (maxRefresh - minRefresh));

    (async () => {
        // Headers copied 1:1 from a real Chrome request via DevTools
        const response = await fetch("https://www.computeruniverse.net/de/page/nvidia-geforce-rtx-3060-ti", {
            headers: {
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                "accept-language": "de-DE,de;q=0.9",
                "cache-control": "no-cache",
                "pragma": "no-cache",
                "sec-fetch-dest": "document",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "none",
                "upgrade-insecure-requests": "1",
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
            },
            referrerPolicy: "strict-origin-when-cross-origin",
            body: null,
            method: "GET",
            mode: "cors",
            credentials: "omit"
        });

        let data = await response.text();
        console.log(data);
    })();
}

search();
Axios code:
const cheerio = require('cheerio');
const axios = require('axios');

axios.defaults.withCredentials = true;

// Main
function search() {
    var url = "https://www.computeruniverse.net/de/page/nvidia-geforce-rtx-3060-ti";

    axios.get(url, {
        headers: {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7,la;q=0.6",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
        },
    })
        .then(function (res) {
            // handle success
            console.log(res.data);
            const $ = cheerio.load(res.data);
        })
        .catch(function (error) {
            // handle error
            console.log(error);
        })
        .then(function () {
            // always executed
        });
}

// Execute
setTimeout(function () {
    search();
}, 500);
I would really like to understand why it is not working and what I can do to make it work. I know it works easily with puppeteer, but I want to avoid that because of performance/resource concerns.
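For reference, this is roughly what the working puppeteer version looks like (a minimal sketch; the launch options and wait condition are assumptions, not the exact code I ran), and it is exactly the overhead I want to avoid:

const puppeteer = require('puppeteer');

async function searchWithBrowser() {
    // A full headless Chromium instance is what makes this heavy on CPU/RAM
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();

    // Navigate like a real browser and wait for network activity to settle
    await page.goto("https://www.computeruniverse.net/de/page/nvidia-geforce-rtx-3060-ti", {
        waitUntil: "networkidle2"
    });

    // Grab the rendered HTML, same as response.text() in the fetch version
    const html = await page.content();
    console.log(html);

    await browser.close();
}

searchWithBrowser();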
Is there a way to get it done with those (or similar) simple request libraries?
Would love to get some feedback/help from an experienced scraper/bot writer.