Can't scrape multiple review pages

Hi there,

I'm trying to scrape customer reviews for restaurants on Yelp. My scraper partly works, but it seems to stop well before it has finished scraping all the reviews for each restaurant. I'm also getting duplicates of each review in my CSV.

Process:

Yelp homepage for "Restaurants in Montreal" (Start URL)
https://www.yelp.ca/search?cflt=restaurants&find_loc=Montreal%2C+QC%2C+Canada

  1. Access restaurant link from homepage to scrape customer review data

  2. Here you can see I created a review-wrapper so that I can access the same data for each review element, plus a simple scrape action for the title of the shop.

  3. I need the scraper to paginate through the homepage for all the different pages of the restaurant listings and links, and also paginate through the restaurant link, to access every review per restaurant.

Where am I going wrong? I appreciate any insight and am happy to clarify if anything isn't clear with my process.

Here's my sitemap:

{"_id":"mtl-rests-reviews","startUrl":["https://www.yelp.ca/search?cflt=restaurants&find_loc=Montreal%2C+QC%2C+Canada"],"selectors":[{"id":"shop","type":"SelectorLink","parentSelectors":["_root","pages"],"selector":"li:nth-of-type(n+8) .css-1pxmz4g a","multiple":true,"delay":0},{"id":"shop-name (key)","type":"SelectorText","parentSelectors":["shop"],"selector":"h1","multiple":false,"regex":"","delay":0},{"id":"review-wrapper","type":"SelectorElement","parentSelectors":["shop","review_pages"],"selector":"div.review__373c0__13kpL","multiple":true,"delay":0},{"id":"customer-name","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"a.css-166la90","multiple":false,"regex":"","delay":0},{"id":"customer-location","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"span.css-n6i4z7","multiple":false,"regex":"","delay":0},{"id":"rating","type":"SelectorElementAttribute","parentSelectors":["review-wrapper"],"selector":"div.i-stars__373c0__1T6rz","multiple":false,"extractAttribute":"aria-label","delay":0},{"id":"review-date","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"span.css-e81eai","multiple":false,"regex":"","delay":0},{"id":"review-text","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"span.raw__373c0__3rcx7","multiple":false,"regex":"","delay":0},{"id":"pages","type":"SelectorLink","parentSelectors":["_root"],"selector":"a.pagination-link-component__09f24__H0ICg","multiple":true,"delay":0},{"id":"review_pages","type":"SelectorLink","parentSelectors":["_root","shop"],"selector":"a.pagination-link-component__373c0__1fUdr","multiple":true,"delay":0}]}

Hi @duncss

Your sitemap structure seems to be kind of messy.

Try my solution:

{"_id":"yelp-details-page","startUrl":["https://www.yelp.ca/search?cflt=restaurants&find_loc=Montreal%2C+QC%2C+Canada"],"selectors":[{"id":"listing-pagination-next","type":"SelectorLink","parentSelectors":["_root","listing-pagination-next"],"selector":"[class*=\"pagination-links\"] a:has(span[class*=\"chevron-right\"]):nth(0)","multiple":true,"delay":0},{"id":"listing-element","type":"SelectorElement","parentSelectors":["_root","listing-pagination-next"],"selector":"div[class*=\"leftRailSearchResultsContainer\"] div[class*=\"hoverable\"]:has(div[class*=\"businessName\"])","multiple":true,"delay":0},{"id":"yelp-link","type":"SelectorLink","parentSelectors":["listing-element"],"selector":"h4:has(span) a","multiple":false,"delay":0},{"id":"order","type":"SelectorText","parentSelectors":["listing-element"],"selector":"h4 span","multiple":false,"regex":"^\\d+","delay":0},{"id":"business","type":"SelectorElement","parentSelectors":["yelp-link"],"selector":"html:has(div[data-hypernova-key*=\"BizDetailsApp\"])","multiple":true,"delay":0},{"id":"yelp_page_url","type":"SelectorElementAttribute","parentSelectors":["business"],"selector":"meta[property=\"og:url\"]","multiple":false,"extractAttribute":"content","delay":0},{"id":"name","type":"SelectorText","parentSelectors":["business"],"selector":"h1","multiple":false,"regex":"","delay":0},{"id":"review_rating","type":"SelectorElementAttribute","parentSelectors":["business"],"selector":"div[class*=\"i-stars\"]","multiple":false,"extractAttribute":"aria-label","delay":0},{"id":"review_count","type":"SelectorText","parentSelectors":["business"],"selector":"div:has(> span > div[class*=\"i-stars\"]) + div","multiple":false,"regex":"","delay":0},{"id":"address","type":"SelectorText","parentSelectors":["business"],"selector":"div[class*=\"stickySidebar\"] p:has(a[href*=\"/map/\"]) > p","multiple":false,"regex":"","delay":0},{"id":"phone","type":"SelectorText","parentSelectors":["business"],"selector":"div[class*=\"stickySidebar\"] div:has(+ div > span[class*=\"phone\"]) > p:nth-of-type(2)","multiple":false,"regex":"","delay":0},{"id":"website","type":"SelectorText","parentSelectors":["business"],"selector":"[data-hypernova-key]:contains('linkText')","multiple":false,"regex":"(?<=linkText\":\")[^\"]+","delay":0},{"id":"monday_hours","type":"SelectorText","parentSelectors":["business"],"selector":"tr:contains(\"Mon\") td p","multiple":false,"regex":"","delay":0},{"id":"tuesday_hours","type":"SelectorText","parentSelectors":["business"],"selector":"tr:contains(\"Tue\") td p","multiple":false,"regex":"","delay":0},{"id":"wednesday_hours","type":"SelectorText","parentSelectors":["business"],"selector":"tr:contains(\"Wed\") td p","multiple":false,"regex":"","delay":0},{"id":"thursday_hours","type":"SelectorText","parentSelectors":["business"],"selector":"tr:contains(\"Thu\") td p","multiple":false,"regex":"","delay":0},{"id":"friday_hours","type":"SelectorText","parentSelectors":["business"],"selector":"tr:contains(\"Fri\") td p","multiple":false,"regex":"","delay":0},{"id":"saturday_hours","type":"SelectorText","parentSelectors":["business"],"selector":"tr:contains(\"Sat\") td p","multiple":false,"regex":"","delay":0},{"id":"sunday_hours","type":"SelectorText","parentSelectors":["business"],"selector":"tr:contains(\"Sun\") td p","multiple":false,"regex":"","delay":0},{"id":"review-wrapper","type":"SelectorElement","parentSelectors":["business"],"selector":"section[aria-label=\"Recommended Reviews\"] > div + div > div > ul > li","multiple":true,"delay":0},{"id":"customer-name","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":".fs-block a","multiple":false,"regex":"","delay":0},{"id":"customer-location","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"div.user-passport-info > div > span","multiple":false,"regex":"","delay":0},{"id":"review-text","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"> div > div + div:has(p):nth(0)","multiple":false,"regex":"","delay":0},{"id":"review-rating","type":"SelectorElementAttribute","parentSelectors":["review-wrapper"],"selector":"> div > div + div:has(div[role=\"img\"]) > div > div:nth(0) > span > div","multiple":false,"extractAttribute":"aria-label","delay":0},{"id":"review-date","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"> div > div + div:has(div[role=\"img\"]) > div > div:nth(1) > span","multiple":false,"regex":"","delay":0}]}

Hope it helps.

Thanks for the response. I'm trying this out now! Will let you know if it works.

Best,
D

Still not catching all of the reviews but certainly more than before.

Thanks

Oh, you want to grab all of the reviews?!

I'm not sure if the extension will be able to handle this amount of records. I would rather test it on the cloud - Web Scraper

But this should work:

{"_id":"yelp-details-page","startUrl":["https://www.yelp.ca/search?cflt=restaurants&find_loc=Montreal%2C+QC%2C+Canada"],"selectors":[{"id":"listing-pagination-next","type":"SelectorLink","parentSelectors":["_root","listing-pagination-next"],"selector":"[class*=\"pagination-links\"] a:has(span[class*=\"chevron-right\"]):nth(0)","multiple":true,"delay":0},{"id":"listing-element","type":"SelectorElement","parentSelectors":["_root","listing-pagination-next"],"selector":"div[class*=\"leftRailSearchResultsContainer\"] div[class*=\"hoverable\"]:has(div[class*=\"businessName\"])","multiple":true,"delay":0},{"id":"yelp-link","type":"SelectorLink","parentSelectors":["listing-element"],"selector":"h4:has(span) a","multiple":false,"delay":0},{"id":"business","type":"SelectorElement","parentSelectors":["yelp-link"],"selector":"html:has(div[data-hypernova-key*=\"BizDetailsApp\"])","multiple":true,"delay":0},{"id":"review-wrapper","type":"SelectorElement","parentSelectors":["business","review-pagination"],"selector":"section[aria-label=\"Recommended Reviews\"] > div + div > div > ul > li","multiple":true,"delay":0},{"id":"customer-name","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":".fs-block a","multiple":false,"regex":"","delay":0},{"id":"customer-location","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"div.user-passport-info > div > span","multiple":false,"regex":"","delay":0},{"id":"review-text","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"> div > div + div:has(p):nth(0)","multiple":false,"regex":"","delay":0},{"id":"review-rating","type":"SelectorElementAttribute","parentSelectors":["review-wrapper"],"selector":"> div > div + div:has(div[role=\"img\"]) > div > div:nth(0) > span > div","multiple":false,"extractAttribute":"aria-label","delay":0},{"id":"review-date","type":"SelectorText","parentSelectors":["review-wrapper"],"selector":"> div > div + div:has(div[role=\"img\"]) > div > div:nth(1) > span","multiple":false,"regex":"","delay":0},{"id":"review-pagination","type":"SelectorLink","parentSelectors":["business","review-pagination"],"selector":"[class*=\"pagination-links\"] a:has(span[class*=\"chevron-right\"]):nth(0)","multiple":true,"delay":0}]}