Python Daomubiji (盗墓笔记) crawler with scrapy-redis -- MongoDB storage
Target site: http://www.daomubiji.com/
Storage: MongoDB
####################################
Configure scrapy-redis by adding the following to settings.py:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
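With SCHEDULER_PERSIST enabled, the distributed spider shown later takes its start URL from Redis rather than from start_urls. A minimal seeding sketch, assuming the redis-py package and the redis_key used by that spider (novelspider:start_urls):

import redis

# Same Redis instance as REDIS_HOST/REDIS_PORT in settings.py.
r = redis.StrictRedis(host='127.0.0.1', port=6379)

# The RedisSpider pops URLs from this list; the key must match the
# spider's redis_key attribute.
r.lpush('novelspider:start_urls', 'http://www.daomubiji.com/dao-mu-bi-ji-1')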
Then add a field for the chapter body to items.py:
text = Field()  # full text of one chapter
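For reference, a minimal items.py consistent with the fields the spiders below assign; the original article only shows the text field, so the rest is a sketch reconstructed from the spider code:

# -*- coding: utf-8 -*-
from scrapy import Item, Field

class NovelspiderItem(Item):
    bookName = Field()     # book (volume) name
    bookTitle = Field()    # title segment parsed from the chapter link text
    chapterNum = Field()   # chapter number
    chapterName = Field()  # chapter name
    chapterURL = Field()   # URL of the chapter page
    text = Field()         # full text of one chapter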
The code below follows a version found online and is included for reference.
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request
from novelspider.items import NovelspiderItem


class novelSpider(CrawlSpider):
    name = 'novelSpider'
    redis_key = 'novelSpider:start_urls'
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        '''
        Parse the home page and follow the link to every book.
        :param response: the home page response
        :return: one Request per book page
        '''
        selector = Selector(response)
        section = selector.xpath('//article')
        bookUrls = section.xpath('p/a/@href').extract()
        print(bookUrls)
        for eachUrl in bookUrls:
            yield Request(eachUrl, callback=self.parse_news)

    def parse_news(self, response):
        '''
        Parse a book page: split each chapter link's text into book name,
        chapter number and chapter title, and record the chapter URL.
        :param response: a book page response
        :return: one item per chapter
        '''
        selector = Selector(response)
        content = selector.xpath('/html/body/section/div[2]/div/article/a/text()').extract()
        urls = selector.xpath('/html/body/section/div[2]/div/article/a/@href').extract()
        bookName = content[0]
        # The Chinese string literals below (the two comparisons and the
        # bookName prefix in the mhgc branch) were lost when the article was
        # extracted; they named the volumes whose link text splits into a
        # different number of fields.
        shahai_flg = bookName.split(' ')[0] == u' 1'
        mhgc_flg = bookName.split(' ')[0] == u' ( )'
        for i, each in enumerate(content):
            item = NovelspiderItem()  # fresh item per chapter
            try:
                if shahai_flg:
                    item['bookName'] = each.split(' ')[0] + each.split(' ')[1]
                    item['chapterNum'] = each.split(' ')[2]
                    item['bookTitle'] = each.split(' ')[3]
                elif mhgc_flg:
                    item['bookName'] = u' ' + each.split(' ')[0]
                    item['chapterNum'] = each.split(' ')[2]
                    item['bookTitle'] = each.split(' ')[3]
                else:
                    item['bookName'] = each.split(' ')[0]
                    item['chapterNum'] = each.split(' ')[1]
                    item['bookTitle'] = each.split(' ')[2]
                item['chapterURL'] = urls[i]
            except Exception:
                continue
            yield item
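For a quick local test, this first spider can be run with scrapy crawl novelSpider from the project root, or programmatically; a minimal sketch, assuming the standard novelspider project layout:

# run_spider.py -- a hypothetical helper to run the spider without the CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('novelSpider')  # spider name as declared above
process.start()               # blocks until the crawl finishes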
# encoding=utf-8
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider
from novelspider.items import NovelspiderItem
import re


class novSpider(RedisSpider):
    name = 'novelspider'
    redis_key = 'novelspider:start_urls'
    # With scrapy-redis the start URL is normally pushed into Redis under
    # redis_key; start_urls is kept here as a fallback, as in the original.
    start_urls = [
        'http://www.daomubiji.com/dao-mu-bi-ji-1'
    ]
    print(start_urls)
    def parse(self, response):
        selector = Selector(response)
        print(response.body)
        bookName = selector.xpath('//h1[@class="focusbox-title"]/text()').extract()[0]
        url = selector.xpath('//article[@class="excerpt excerpt-c3"]/a/@href').extract()
        excerpts = selector.xpath('//article[@class="excerpt excerpt-c3"]/a/text()').extract()
        print(excerpts)
        for i in range(len(url)):
            item = NovelspiderItem()  # fresh item per chapter, not one shared item
            item['bookName'] = bookName
            item['chapterURL'] = url[i]
            try:
                item['bookTitle'] = excerpts[i].split(' ')[0]
                item['chapterNum'] = excerpts[i].split(' ')[1]
                item['chapterName'] = excerpts[i].split(' ')[2]
                # item['chapterName2'] = excerpts[i].split(' ')[3]
                # item['chapterName3'] = excerpts[i].split(' ')[4]
            except Exception:
                continue
            # meta={'item': item} passes the partially filled item on to the
            # callback that adds the chapter text.
            yield Request(url[i], callback=self.parseContent, meta={'item': item})
    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']  # response.meta is a dict
        html = selector.xpath('//div[@class="content"]').extract()[0]
        # The HTML tags inside the original regex literals were stripped when
        # the article was extracted; the patterns below are a plausible
        # reconstruction for this page structure, not the verbatim original.
        textField = re.search('<div class="content">(.*?)</div>', html, re.S).group(1)
        text = re.findall('<p>(.*?)</p>', textField, re.S)
        print(text)
        item['text'] = ''.join(text)
        yield item
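The title promises MongoDB storage, but the original article's pipeline was not preserved. A minimal pipelines.py sketch, assuming pymongo and a local MongoDB; the database and collection names here are illustrative:

# -*- coding: utf-8 -*-
import pymongo


class NovelspiderPipeline(object):
    def open_spider(self, spider):
        # Connect once when the spider starts.
        self.client = pymongo.MongoClient('localhost', 27017)
        self.collection = self.client['novel']['chapters']  # hypothetical names

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Store each chapter as one document.
        self.collection.insert_one(dict(item))
        return item

Enable the pipeline in settings.py with ITEM_PIPELINES = {'novelspider.pipelines.NovelspiderPipeline': 300}.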