温馨提示:本文翻译自stackoverflow.com,查看原文请点击:javascript - Why data scraping with puppeteer always gives data from first page?
express javascript node.js puppeteer

javascript - 为什么用 puppeteer 抓取数据时总是得到第一页的数据?

发布于 2020-04-17 14:44:33

我正在尝试使用 puppeteer 从网站上抓取数据。每次请求数据时,即使我传入的是其他页面的网址,返回的也总是第一页的数据。在 Google Chrome 浏览器中直接打开这些网址,可以得到与网址对应的正确页面数据;但通过 API 或 Postman 发起请求时,得到的始终是第一页的数据。以下是我的脚本:

/**
 * Asker's reproduction script: logs in to daraz.pk in a visible browser and
 * then navigates the same tab through listing pages 1 to 9.
 *
 * Nothing is extracted from the listing pages here; the loop only navigates.
 * The browser is never closed inside this function.
 *
 * @returns {Promise<void>} Resolves once the last listing page has loaded.
 */
async function main() {
    // headless: false opens a real browser window.
    const browser = await puppeteer.launch({ headless: false });
    const page = await browser.newPage();
    await page.setViewport({ width: 1200, height: 720 })
    await page.goto('https://member.daraz.pk/user/login', { waitUntil: 'networkidle0' }); // wait until page load
    // 'username' / 'pass' are presumably placeholders for real credentials.
    await page.type('input[type="text"]', 'username', { delay: 10 });
    await page.type('input[type="password"]', 'pass', { delay: 10 });

    // click and wait for navigation

    await page.click('.next-btn-large');
    // Fixed 8-second pause after the click; no navigation event is awaited.
    await page.waitFor(8000);
    // NOTE(review): page1 is a second tab that is only given a viewport and
    // is never used again in this function; all navigation below uses `page`.
    const page1 = await browser.newPage();
    await page1.setViewport({ width: 1200, height: 720 })
    await page.waitFor(1000);
    // Visits ?page=1 through ?page=9 in the first tab, one after another.
    for (let i = 1; i < 10; i++) {
        await page.goto(`https://www.daraz.pk/air-conditioners/gree/?page=${i}`, { waitUntil: 'networkidle0' });

        // always return first page data

    }

}

main();

查看更多

提问者
Md Ch
被浏览
61
jfriend00 2020-02-06 15:17

我在评论中建议的脚本读取的是图像的 src 值,而这些图像只有在页面处于可见状态时才会加载。因此,如果当前显示的不是对应的标签页,它们可能就不会被加载。这是页面内置的某种按需(懒)加载图像的机制。最好改为查看页面中不以这种方式加载的其他内容。我已修改脚本来做到这一点。

这是一个在我这里可以正常运行的脚本。我不知道您想从页面中获取哪些数据,但它会获取页面中每个产品的 sku-simple 值(即 data-sku-simple 属性)和 title 值。为简洁起见,我只把每页的前 10 个产品输出到控制台,并把循环缩减为只遍历 3 页。您显然可以根据需要进行调整。我还从脚本中删除了用户名/密码,因为我看到您已不再公开它们。您可以自行填写。

const puppeteer = require('puppeteer');

/**
 * Logs in to daraz.pk in a visible browser, then walks the first three
 * "Gree air conditioner" listing pages and prints, for each page, the
 * `data-sku-simple` attribute and link title of its first ten products.
 *
 * The browser is always closed before the function settles, including when
 * a navigation or selector step throws.
 *
 * @returns {Promise<void>} Resolves after the last page has been printed.
 * @throws Rejects with whatever Puppeteer error aborted the run.
 */
async function main() {
    // Plain timer-based pause. page.waitFor() is deprecated in Puppeteer and
    // no longer available in newer releases, so it is not relied on here.
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    const browser = await puppeteer.launch({ headless: false });
    try {
        // A single tab is used for everything. The previous version opened a
        // second, never-used tab; in a headful browser that presumably left
        // this tab in the background, which matters for pages that only load
        // content while visible.
        const page = await browser.newPage();
        await page.setViewport({ width: 1200, height: 720 });
        await page.goto('https://member.daraz.pk/user/login', { waitUntil: 'networkidle0' });

        // Fill in your own credentials in place of the 'xxx' / 'yyy' placeholders.
        await page.type('input[type="text"]', 'xxx', { delay: 10 });
        await page.type('input[type="password"]', 'yyy', { delay: 10 });

        // Submit the login form, then give the site a fixed amount of time to
        // finish signing in before navigating away.
        await page.click('.next-btn-large');
        await sleep(8000);

        for (let i = 1; i <= 3; i++) {
            await page.goto(`https://www.daraz.pk/air-conditioners/gree/?page=${i}`, { waitUntil: 'networkidle0' });

            // This callback runs inside the page, so it must not reference
            // any variable from this Node.js scope.
            const products = await page.$$eval('.c2prKC', (elements) =>
                elements.map((el) => {
                    const link = el.querySelector('.c16H9d a');
                    return {
                        skuSimple: el.getAttribute('data-sku-simple'),
                        // Cards without a title link are reported as "<unknown>".
                        title: link ? link.getAttribute('title') : '<unknown>',
                    };
                }));

            console.log(`Data for page ${i}:`);
            console.log(products.slice(0, 10));
        }
    } finally {
        // Release the browser process on success and on failure alike.
        await browser.close();
    }
}

// Run the scraper. A failed run is reported on stderr and reflected in the
// process exit code instead of being left as an unhandled promise rejection.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

我在控制台中看到了这样的输出,因此它肯定是在获取页面并从这些页面中的DOM检索数据:

Data for page 1:
[
  {
    skuSimple: 'GR678HL0KV5HWNAFAMZ-4744951',
    title: 'Gree Inverter AC - GS-18CITH12G - 1.5 ton - Inverter  Air Conditioner - Cozy Series - Heat N Cool - Grey'
  },
  {
    skuSimple: 'GR678HL09YUCKNAFAMZ-3940302',
    title: 'Gree GS-12FITH1W - Fairy Inverter Air Conditioner Series - White'
  },
  {
    skuSimple: 'GR678HL0RTUHWNAFAMZ-3940305',
    title: 'Gree GS-18FITH1W - Fairy Inverter Air Conditioner Series - White'
  },
  {
    skuSimple: 'GR678HL1E0WZSNAFAMZ-1741958',
    title: 'Gree Split Air Conditioner - GS-12LM4 - 1 Ton - White'
  },
  {
    skuSimple: '2779851_PK-1252862621',
    title: 'Gree 18CITHI 12G- DC Inverter AC - 1.5 Ton'
  },
  {
    skuSimple: 'GR678HLEOKNJNAFAMZ-668566',
    title: 'Gree Gree GS-12LM -1 Ton Air Conditioner - White'
  },
  {
    skuSimple: '114820460_PK-1266640670',
    title: 'Gree Windows AC 0.75 Ton with Remote Control 60% Electricity Saving'
  },
  {
    skuSimple: '2864384_PK-1246026961',
    title: 'Gree Inverter AC - GS-12CITH12G - 1.0ton - Inverter Air Conditioner - Cozy Series - Heat N Cool - Grey'
  },
  {
    skuSimple: '105610333_PK-1253012621',
    title: 'Gree 1.0 Ton Dc Inverter AC Heat & Cool R-410A Air Conditioner - 12cith12G - Grey'
  },
  {
    skuSimple: '105616318_PK-1253002672',
    title: 'Gree 1.5 Ton Dc Inverter AC Heat & Cool R-410A Air Conditioner - 18cith12G - Grey'
  }
]
Data for page 2:
[
  {
    skuSimple: '109636918_PK-1260070281',
    title: 'New Gree DC Inverter Ac 1(ton) 12CIT'
  },
  {
    skuSimple: '114536248_PK-1266322653',
    title: 'Gree 1.0 Ton Heat & Cool DC Inverter Air conditioner 12CITH'
  },
  {
    skuSimple: '109830097_PK-1260278793',
    title: 'AC Dawlance Inspire Plus Inverter 30 1.5 Ton Split Saving 26000 Yearly'
  },
  {
    skuSimple: '121648880_PK-1277580612',
    title: 'Gs-24Lm4L - 2 Ton Ac - White - Brand Warranty'
  },
  {
    skuSimple: '106364064_PK-1254400160',
    title: 'Gree Floor Standing GF-48FW - Floor Standing Low Voltage Startup Series - White'
  },
  {
    skuSimple: '109324039_PK-1259442545',
    title: 'Gree G10 Inverter 1.5 Ton (18000 BTU) GS-18CITH2/2G Split Air Conditioner'
  },
  {
    skuSimple: '122056481_PK-1278142392',
    title: 'AC Gree 12FITH1C 1 Ton DC Inverter Split AC 50% to 70% Energy Saving'
  },
  {
    skuSimple: '115570453_PK-1267506144',
    title: 'AC Gree GS-12CITH13M Inverter 1 Ton (Wifi) Split 60% to 70% Energy Saving'
  },
  {
    skuSimple: 'GR678HL0ZWE2CNAFAMZ-4776611',
    title: 'Gree 1.5 Ton Dc Inverter Heat & Cool R-410A Air Conditioner - 18cith11B - Black'
  },
  {
    skuSimple: '110096660_PK-1260802813',
    title: 'GREE 1.0 TON SPLIT COOL ONLY AIR CONDITIONER 12LM4'
  }
]
Data for page 3:
[
  {
    skuSimple: 'GR678HL017DY0NAFAMZ-4102700',
    title: 'Gree 1.5 Ton Dc Inverter Heat & Cool R-410A Air Conditioner - 18cith11S - Silver'
  },
  {
    skuSimple: '115554341_PK-1267490372',
    title: 'Gree GS-18CITH13M Inverter 1.5 Ton (Wifi) Split Up to 60% Energy Saving'
  },
  {
    skuSimple: '109428468_PK-1259596998',
    title: 'Gree Inverter Air conditioner 2 ton'
  },
  {
    skuSimple: '124818788_PK-1282694870',
    title: 'Gree Inverter Air Conditioner - GS-24CITH11W - Cozy Inverter Series - 02ton - White'
  },
  {
    skuSimple: '3407444_PK-1247135008',
    title: 'Gree 2 Ton Dc Inverter Heat & Cool R-410A Air Conditioner - 24cith11S - Silver'
  },
  {
    skuSimple: '109826799_PK-1260322442',
    title: 'Gree GS-18CITH13M Inverter 1.5 Ton (Wifi) Split Up to 60% Energy Saving'
  },
  {
    skuSimple: '130883483_PK-1290780443',
    title: 'Gree - Inverter Split Air Conditioner - 1.5 Ton'
  },
  {
    skuSimple: '107714050_PK-1256398549',
    title: 'Gree Inverter Air conditioner 1.5 ton'
  },
  {
    skuSimple: 'GR678HL0Q02DENAFAMZ-5098883',
    title: 'GS-18LM4 - Gree Air Conditioner - 1.5 Ton - White'
  },
  {
    skuSimple: 'GR678HL1IIQ8YNAFAMZ-5098768',
    title: 'Gree Gree - GS - 12CITH12G - 1.0 ton - Inverter Air Conditioner - Grey'
  }
]