Web Scraper scrapes only 40%-50% of the page

radek · February 28, 2023, 1:03pm

Hi,

I'm trying to scrape all records using this sitemap:

{"_id":"automatyka1","startUrl":["https://www.automatyka.pl/firmy?page=1#paginator"],"selectors":[{"id":"pagination","parentSelectors":["_root","pagination"],"paginationType":"auto","selector":"div.sort:nth-of-type(1) .page-next a","type":"SelectorPagination"},{"id":"company-link","parentSelectors":["element-card"],"type":"SelectorLink","selector":".desc h2 a","multiple":false},{"id":"company-name","parentSelectors":["company-link"],"type":"SelectorText","selector":"div.h1","multiple":false,"regex":""},{"id":"company-description","parentSelectors":["company-link"],"type":"SelectorText","selector":"div.firmDescription","multiple":false,"regex":""},{"id":"company-categories-offer","parentSelectors":["company-link"],"type":"SelectorGroup","selector":".box-medium div.boxContent","extractAttribute":""},{"id":"company-logo","parentSelectors":["company-link"],"type":"SelectorImage","selector":"img.col","multiple":false},{"id":"company-nip","parentSelectors":["company-link"],"type":"SelectorText","selector":"dt:contains('NIP:') + dd","multiple":false,"regex":""},{"id":"company-address","parentSelectors":["company-link"],"type":"SelectorText","selector":".boxContent dd:nth-of-type(1)","multiple":false,"regex":""},{"id":"company-phone1","parentSelectors":["company-link"],"type":"SelectorText","selector":"p:nth-of-type(1) a.shownNumber","multiple":false,"regex":""},{"id":"company-phone2","parentSelectors":["company-link"],"type":"SelectorText","selector":"p:nth-of-type(2) a","multiple":false,"regex":""},{"id":"company-email","parentSelectors":["company-link"],"type":"SelectorElementAttribute","selector":"dd:contains(\"E-mail\") a[href*=\"mail\"]","multiple":false,"extractAttribute":"href"},{"id":"element-card","parentSelectors":["pagination"],"type":"SelectorElement","selector":"div.iBox:nth-of-type(n+2) div.clearfix","multiple":true},{"id":"company-headquarters","parentSelectors":["company-link"],"type":"SelectorText","selector":"dt:contains('\n                        Siedziba:\n                    ') + dd","multiple":false,"regex":""},{"id":"company-website","parentSelectors":["company-link"],"type":"SelectorText","selector":"dt:contains('Strona firmowa: WWW:') + dd a","multiple":false,"regex":""},{"id":"company-reach","parentSelectors":["company-link"],"type":"SelectorText","selector":"dt:contains('Zasięg:') + dd","multiple":false,"regex":""},{"id":"company-offers","parentSelectors":["company-link"],"type":"SelectorText","selector":"dt:contains('\n            Firma oferuje: Oferuje:\n        ') + dd","multiple":false,"regex":""},{"id":"company-subname","parentSelectors":["company-link"],"type":"SelectorText","selector":"h2.p","multiple":false,"regex":""},{"id":"company-opinions","parentSelectors":["company-link"],"type":"SelectorGroup","selector":"div.section","extractAttribute":""},{"id":"company-products-and-services","parentSelectors":["company-link"],"type":"SelectorGroup","selector":"div.clItem","extractAttribute":""},{"id":"company-products-and-services-names-and-links","parentSelectors":["company-link"],"type":"SelectorGroup","selector":".col h3 a","extractAttribute":"href"},{"id":"company-update","parentSelectors":["company-link"],"type":"SelectorText","selector":"small.update","multiple":false,"regex":""},{"id":"company-image1","parentSelectors":["company-link"],"type":"SelectorLink","selector":"li.wider:nth-of-type(1) a","multiple":false},{"id":"company-image2","parentSelectors":["company-link"],"type":"SelectorLink","selector":"li.wider:nth-of-type(2) a","multiple":false},{"id":"company-image3","parentSelectors":["company-link"],"type":"SelectorLink","selector":"li.wider:nth-of-type(3) a","multiple":false},{"id":"company-image4","parentSelectors":["company-link"],"type":"SelectorText","selector":"li.wider:nth-of-type(4) a","multiple":false,"regex":""},{"id":"company-image5","parentSelectors":["company-link"],"type":"SelectorText","selector":"li.wider:nth-of-type(5) a","multiple":false,"regex":""},{"id":"company-image6","parentSelectors":["company-link"],"type":"SelectorText","selector":"li.wider:nth-of-type(6) a","multiple":false,"regex":""},{"id":"company-brands-names-and-links","parentSelectors":["company-link"],"type":"SelectorGroup","selector":".box-soft a:nth-of-type(n+1)","extractAttribute":""}]}

And I have two issues:

1. How can I scrape the first-page elements? (The simple navigation goes directly to the 2nd page and does not scrape elements from the 1st one.)
2. This sitemap scrapes only ca. 40-50% of all records, and I don't know why. The request interval and page load delay were both set to over 6000, so I believe that is not the reason.

I would appreciate your help.

Regards,
Radek

leemeng · July 30, 2023, 5:40am

This site's URL changes along with the page number, so you don't really need a paginator. You can use the "range URL with increment" method described under "Specify multiple urls with ranges" in the Web Scraper documentation.

In the example below, I have set the range to [1-10], the first 10 pages. You can do some math to figure out the proper range; each page has 20 results.

{"_id":"automatyka","startUrl":["https://www.automatyka.pl/firmy?page=[1-10]#paginator"],"selectors":[{"id":"Result rows","multiple":true,"parentSelectors":["_root"],"selector":"div.iBox:nth-of-type(n+2) div.desc","type":"SelectorElement"},{"id":"Company","multiple":false,"parentSelectors":["Result rows"],"regex":"","selector":"h2 a","type":"SelectorText"},{"id":"Siedziba","multiple":false,"parentSelectors":["Result rows"],"regex":"","selector":"dl dd","type":"SelectorText"},{"id":"Desc","multiple":false,"parentSelectors":["Result rows"],"regex":"","selector":"p span","type":"SelectorText"},{"id":"Link","linkType":"linkFromHref","multiple":false,"parentSelectors":["Result rows"],"selector":"h2 a","type":"SelectorLink"}]}