1. Background: the original program required a novel to be crawled successfully in a single run; if anything failed, the whole crawl had to start over from scratch, which hurt crawling efficiency.
2. Improvement approach
(1) Check what has already been crawled: if a chapter is already stored in MongoDB, do not crawl it again (see the sketch after this list).
(2) Time the total crawl.
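A minimal sketch of ideas (1) and (2) taken together, assuming the same local MongoDB instance used in the code below; the collection name 'some_novel' and the sample url are only illustrative:

import time
from pymongo import MongoClient

start_time = time.time()  # start timing the whole crawl
conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
collection = conn.novels['some_novel']  # hypothetical collection name

def already_crawled(url):
    # a chapter is skipped when a record with the same url already exists
    return collection.find_one({"url": url}) is not None

chapter_url = "https://www.xbiquge.la/10/10489/4534454.html"
if already_crawled(chapter_url):
    print("already stored, skip:", chapter_url)
else:
    print("not stored yet, crawl:", chapter_url)

print("Total crawl time:", time.time() - start_time)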
3. Code
(1) xbiquge/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import time
from pymongo import MongoClient


class XbiqugePipeline(object):
    conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
    db = conn.novels  # connection object for the "novels" database
    name_novel = ''
    url_firstchapter = ''
    name_txt = ''
    start_time = time.time()  # start time of the whole crawl

    # class initialisation
    def __init__(self):
        return

    # called when the spider opens
    def open_spider(self, spider):
        return

    def get_collection(self, name_collection):  # return the collection (cursor) object
        myset = self.db[name_collection]
        return myset

    def process_item(self, item, spider):
        # if self.name_novel == '':
        self.name_novel = item['name']
        self.url_firstchapter = item['url_firstchapter']
        self.name_txt = item['name_txt']
        myset = self.db[self.name_novel]
        myset.insert_one(dict(item))
        # if self.name_novel != '':
        #     exec('self.db.' + self.name_novel + '.insert_one(dict(item))')
        return item

    # read the chapter contents back from the database and write them to a txt file
    def content2txt(self, dbname, firsturl, txtname):
        myset = self.db[dbname]
        record_num = myset.find().count()  # number of chapters stored (cursor.count() requires pymongo 3.x)
        print("Total number of chapters:", record_num)
        counts = record_num
        url_c = firsturl
        start_time = time.time()  # start time of the txt-generation step
        f = open(txtname + ".txt", mode='w', encoding='utf-8')  # open <novel name>.txt for writing
        for i in range(counts):  # loop over all chapters
            # ---------- alternative: use the integer returned by count() to decide whether data exists ----------
            # record_m_count = myset.find({"url": url_c}, {"content": 1, "_id": 0}).count()
            # if record_m_count == 0:
            #     print("No chapter content found in the collection.\nFailing url:", url_c)
            #     break
            # ------------------------------------------------------------------------------------------------------
            # ---------- read the cursor with next() and catch the error raised when no data is returned ----------
            try:
                record_m = myset.find({"url": url_c}, {"content": 1, "_id": 0}).next()
            # except Exception as e:
            except StopIteration:
                print("No chapter content found in the collection.\nFailing url:", url_c)
                break  # leave the for loop and stop generating the txt file
            # ------------------------------------------------------------------------------------------------------
            record_content_c2a0 = ''
            # ---------- alternative: iterate over the cursor with a for loop ----------
            # record_i = myset.find({"url": url_c}, {"content": 1, "_id": 0})
            # for record_m in record_i:
            #     record_content_c2a0 = record_m["content"]  # chapter content
            # ---------------------------------------------------------------------------
            record_content_c2a0 = record_m["content"]
            # record_content = record_content_c2a0.replace(u'\xa0', u'')  # strip the special character \xc2\xa0
            record_content = record_content_c2a0
            # print(record_content)
            f.write('\n')
            f.write(record_content + '\n')
            f.write('\n')
            url_ct = myset.find({"url": url_c}, {"next_page": 1, "_id": 0})  # query object holding the next-chapter link
            for item_url in url_ct:
                url_c = item_url["next_page"]  # the next-chapter url becomes url_c for the next loop iteration
                # print("next page", url_c)
        f.close()
        print("Time to generate the txt file:", time.time() - start_time)
        print("Total crawl time:", time.time() - self.start_time)
        print(txtname + ".txt" + " has been generated!")
        return

    # when the spider closes, call content2txt to generate the txt file
    def close_spider(self, spider):
        if self.name_novel != '' and self.url_firstchapter != '' and self.name_txt != '':
            self.content2txt(self.name_novel, self.url_firstchapter, self.name_txt)
        return
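The pipeline only runs if it is registered in the project settings, as the header comment above notes. A typical entry in xbiquge/settings.py would look like the following (the priority value 300 is just the common example value):

# xbiquge/settings.py
ITEM_PIPELINES = {
    'xbiquge.pipelines.XbiqugePipeline': 300,
}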
(2) Example spider: xbiquge/spiders/sancun.py
# -*- coding: utf-8 -*-
import scrapy
from xbiquge.items import XbiqugeItem
from xbiquge.pipelines import XbiqugePipeline


class SancunSpider(scrapy.Spider):
    name = 'sancun'
    allowed_domains = ['www.xbiquge.la']
    # start_urls = ['http://www.xbiquge.la/10/10489/']
    url_ori = "https://www.xbiquge.la"
    url_firstchapter = "https://www.xbiquge.la/10/10489/4534454.html"
    name_txt = "./novels/三寸人间"
    url_chapters = url_firstchapter[0:32]  # table-of-contents url (the chapter url without the page name)

    pipeline = XbiqugePipeline()
    novelcollection = pipeline.get_collection(name)  # collection (cursor) object for this novel; a MongoDB collection is the counterpart of a MySQL table
    # --------------------------------------------
    # If a record's next_page is the table-of-contents url, delete that record; otherwise a
    # re-crawl can leave several records pointing at the contents page, and the latest
    # chapters can no longer be retrieved.
    # (cursor.count() and collection.remove() require pymongo 3.x)
    if novelcollection.find({"next_page": url_chapters}).count() != 0:
        print("Records whose next_page is the contents-page url:", novelcollection.find({"next_page": url_chapters}).count())
        novelcollection.remove({"next_page": url_chapters})
        print("Records containing the contents-page url have been deleted.")
    # --------------------------------------------
    novelcounts = novelcollection.find().count()
    novelurls = novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})

    item = XbiqugeItem()
    item['id'] = novelcounts  # initialise id with the number of records already in the collection
    item['name'] = name
    item['url_firstchapter'] = url_firstchapter
    item['name_txt'] = name_txt

    def start_requests(self):
        start_urls = [self.url_chapters]
        print("Novel contents url:", start_urls)
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):  # extract chapter links from the contents page and crawl only those not already stored in MongoDB
        count_bingo = 0  # number of chapters already present in the collection
        dl = response.css('#list dl dd')  # chapter-link nodes
        for dd in dl:
            count_iterator = 0
            self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[0]  # build the full chapter url
            # print("url extracted from the page:", self.url_c)
            self.novelurls = self.novelcollection.find({}, {"_id": 0, "id": 1, "url": 1})  # re-run the query to reset the cursor so the for loop can traverse it from the start
            for url in self.novelurls:
                # print("url from mongodb:", url)
                if url["url"] == self.url_c:  # the url is already in the collection, stop searching
                    count_bingo += 1
                    count_iterator += 1
                    break
            if count_iterator != 0:  # a hit was found, so skip this chapter and continue with the next one
                continue
            print("Crawling url:", self.url_c)
            # yield scrapy.Request(self.url_c, callback=self.parse_c, dont_filter=True)
            yield scrapy.Request(self.url_c, callback=self.parse_c)  # parse_c (called via yield) extracts the chapter url, previous/next page links and chapter content
            # print(self.url_c)
        print("Records already in the collection (count_bingo):", count_bingo)

    def parse_c(self, response):
        self.item['id'] += 1
        self.item['url'] = response.url
        self.item['preview_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[1]
        self.item['next_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[3]
        title = response.css('.con_top::text').extract()[4]
        contents = response.css('#content::text').extract()
        text = ''
        for content in contents:
            text = text + content
        # print(text)
        self.item['content'] = title + "\n" + text.replace('\015', '\n')  # combine chapter title and body; '\015' is the octal escape for ^M (carriage return) and is replaced with a newline
        yield self.item  # hand the item to the pipeline
        if self.item['url'][32:39] == self.item['next_page'][32:39]:  # the chapter is split across several pages: keep following next_page
            self.url_c = self.item['next_page']
            yield scrapy.Request(self.url_c, callback=self.parse_c)
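The spider imports XbiqugeItem from xbiquge/items.py, which is not listed in this post. A minimal definition consistent with the fields assigned above would look like the sketch below; the field set is inferred from the spider and pipeline code, so treat it as an assumption rather than the original file:

# xbiquge/items.py (sketch; field set inferred from the code above)
import scrapy

class XbiqugeItem(scrapy.Item):
    id = scrapy.Field()                # running chapter number
    name = scrapy.Field()              # novel name, also used as the collection name
    url_firstchapter = scrapy.Field()  # url of the first chapter
    name_txt = scrapy.Field()          # path of the generated txt file
    url = scrapy.Field()               # url of the current chapter page
    preview_page = scrapy.Field()      # link to the previous page
    next_page = scrapy.Field()         # link to the next page
    content = scrapy.Field()           # chapter title and body text

With the pipeline registered in settings.py, the crawl is started in the usual Scrapy way with scrapy crawl sancun; re-running the command only fetches chapters that are not yet stored in MongoDB.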