python - Scrapy: parse items from the 1st page, then follow a link to get additional items -
UPDATE: I am able to move to the next request, but it doesn't return from the subpage — it just iterates the sequence again. The data I am trying to extract is in a table like this:
table date_1 | source_1 | link article_1 | date_2 | source_2 | link article_2 | etc....
and I need to first collect date_1 and source_1, then go to the article link, then repeat...
Any help is appreciated. :)
from scrapy.spiders import BaseSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors import LinkExtractor
from dirbot.items import WebsiteLoader
from scrapy.http import Request
from scrapy.http import HtmlResponse
from urlparse import urljoin  # Python 2 stdlib, matching this scrapy.contrib-era code


class DIndexSpider(BaseSpider):
    """Scrape name/url pairs from the business index page, then follow each
    article's own link to pick up its publish date in a second callback.
    """

    name = "dindex"
    allowed_domains = ["newslookup.com"]
    start_urls = [
        "http://www.newslookup.com/business/"
    ]

    def parse_subpage(self, response):
        """Second step: finish the item started in parse() by adding the
        publish date taken from the article page itself, then yield it.
        """
        self.log("scraping: " + response.url)
        # The partially-filled loader handed over by parse() via request meta.
        il = response.meta['il']
        time = response.xpath('//div[@id="update_data"]//td[@class="stime3"]//text()').extract()
        il.add_value('publish_date', time)
        yield il.load_item()

    def parse(self, response):
        """First step: for each article cell collect name and url, then
        follow the article link so parse_subpage() can complete the item.
        """
        self.log("scraping: " + response.url)
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//td[@class="article"]')
        for site in sites:
            il = WebsiteLoader(response=response, selector=site)
            il.add_xpath('name', 'a/text()')
            il.add_xpath('url', 'a/@href')
            # BUG FIX: the original yielded a Request back to the hard-coded
            # index URL, so parse_subpage() always scraped the index page
            # again and never reached the article subpage. Follow the link
            # we just extracted instead; urljoin handles relative hrefs.
            hrefs = site.select('a/@href').extract()
            if not hrefs:
                continue  # article cell without a link — nothing to follow
            yield Request(urljoin(response.url, hrefs[0]),
                          meta={'il': il},
                          callback=self.parse_subpage)
That's because you need to use the CrawlSpider class instead of BaseSpider:
from scrapy.spiders import CrawlSpider


class DIndexSpider(CrawlSpider):
    # CrawlSpider adds rule-driven link following on top of the base spider;
    # keep the rest of the spider (name, rules, callbacks) as before.
    # ...
    pass
Comments
Post a Comment