4.8 编写 spider 完成抓取过程
import scrapy
from urllib import parse
from scrapy import Requestfor post_url in post_urls:
yield Request(url=parse.urljoin(response.url, post_url), callback=self.parseDetail)Last updated
import scrapy
from urllib import parse
from scrapy import Requestfor post_url in post_urls:
yield Request(url=parse.urljoin(response.url, post_url), callback=self.parseDetail)Last updated
post_nodes = response.css("#archive .post-thumb a")
for post_node in post_nodes:
image_url =post_node.css("img::attr(src)").extract_first("")
post_url = post_node.css("::attr(href)").extract_first("")
yield Request(url=parse.urljoin(response.url, post_url), callback=self.parseDetail,
meta={"image_url": image_url})front_image_url=response.meta.get('front_image_url')# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ArticlespiderItem(scrapy.Item):
    """Scaffold item generated by `scrapy startproject`; no fields yet."""
    # Declare fields like so:
    # name = scrapy.Field()
    pass
class JobBoleArticleItem(scrapy.Item):
    """Container for one scraped JobBole article."""
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    content = scrapy.Field()
    like_num = scrapy.Field()
    favorite_num = scrapy.Field()
    comment_num = scrapy.Field()
    # List of cover-image URLs (ImagesPipeline expects a list)
    image_url = scrapy.Field()
    # `url` was missing although the spider assigns item["url"];
    # scrapy.Item raises KeyError for undeclared fields.
    url = scrapy.Field()


from ArticleSpider.items import JobBoleArticleItem
class JobboleSpider(scrapy.Spider):
    def parseDetail(self, response):
        """Parse one article detail page and yield a populated item.

        The elided parts (``...``) extract `title` and `image_url` from the
        response before they are stored on the item.
        """
        article_item = JobBoleArticleItem()
        ...
        article_item["title"] = title
        article_item["url"] = response.url
        # ImagesPipeline requires the URL field to be a *list* of URLs.
        article_item['image_url'] = [image_url]
        ...
        yield article_item

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # Lower numbers run earlier in the pipeline chain (range 0-1000).
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os  # was missing in the original; needed for the path setup below

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # Priority 1: download images before the article pipeline runs.
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
# Which item field holds the list of image URLs to download
IMAGES_URLS_FIELD = "image_url"
# Store downloaded images in an `images/` directory next to this settings file
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')
# Drop images smaller than 100x100 pixels
IMAGES_MIN_HEIGHT = 100
IMAGES_MIN_WIDTH = 100

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
class ArticleImagePipeline(ImagesPipeline):
    """ImagesPipeline subclass that records where each image was stored."""

    def item_completed(self, results, item, info):
        # `results` is a list of (success, value) tuples, one per image URL.
        # Only successful downloads carry a dict with a "path"; a failed
        # download yields a Failure instead, so guard on the success flag
        # (the original indexed value["path"] unconditionally).
        for ok, value in results:
            if ok:
                item["front_image_path"] = value["path"]
        # NOTE(review): the item must declare a "front_image_path" field;
        # JobBoleArticleItem above does not — confirm/add it in items.py.
        return item

...
# The original line was a syntax error ('...Pipeline'1): the ": " separating
# the pipeline path from its priority was missing.
ITEM_PIPELINES = {
    # ... the other pipelines from the earlier settings stay here ...
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}
...