Website: https://www.indcareer.com/find/all-colleges-in-maharashtra
I am trying to export, in Excel format, the links found on each page linked from the above website, but when I run this program the data is being scraped in a random order. What should I change so that it traverses the links sequentially?
import scrapy
class CollegedetailsSpider(scrapy.Spider):
    """Collect the external website link of each college listed on indcareer.com.

    The same ``parse`` callback handles both kinds of page: listing pages
    (which contain ``div.media`` entries and a "next" pager link) and
    individual college pages (which contain the ``rel='nofollow'`` link that
    is emitted as ``College_Link``).

    NOTE(review): Scrapy downloads requests concurrently, so items are
    emitted in the order responses arrive, not in listing order. If strict
    ordering is required, consider lowering ``CONCURRENT_REQUESTS`` or using
    request priorities, or sort the exported data afterwards.
    """

    name = "link_list"
    start_urls = ["https://www.indcareer.com/find/all-colleges-in-maharashtra"]

    def parse(self, response):
        """Follow college and pagination links, and emit the college link if present.

        Args:
            response: The downloaded page (listing page or college page).

        Yields:
            ``scrapy.Request`` objects for every college entry and for the
            next listing page, and a ``{'College_Link': <href>}`` item when
            the page contains a ``rel='nofollow'`` link inside a table cell.
        """
        # Queue every college detail page referenced from this listing page.
        for coll in response.xpath("//div[@class='media']"):
            next_coll = coll.xpath(".//h4/a/@href").extract_first()
            if next_coll is None:
                # Entry without a link: urljoin(None) would just re-request
                # the current page, so skip it.
                continue
            next_coll_link = response.urljoin(next_coll)
            yield scrapy.Request(url=next_coll_link, callback=self.parse)

        # xpath() always returns a selector list (never None), so test the
        # extracted value instead; otherwise listing pages emit None items.
        college_link = response.xpath(
            ".//tr/td/a[@rel='nofollow']/@href"
        ).extract_first()
        if college_link is not None:
            yield {
                'College_Link': college_link
            }

        # Continue to the next listing page, if there is one.
        next_page = response.xpath(
            "//li[@class='pager-next']/a/@href"
        ).extract_first()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_link, callback=self.parse)