Python Daomu Biji (盗墓笔记) novel crawler with scrapy-redis -- storing to MongoDB

    Goal: scrape the full text of the Daomu Biji novels
    Site: http://www.daomubiji.com/
    Approach:
        Crawl the index page first to get the link to each book, then:
            follow each book page and collect its chapter entries
            parse each entry into book name, chapter number and chapter title
            fetch the text of every chapter
            store the results in MongoDB
####################################
Distributed crawling with scrapy-redis
Prerequisite: a Redis server must already be running.

Add the following to settings.py:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
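
With SCHEDULER_PERSIST enabled and the RedisSpider shown further down, the crawl waits until a start URL is pushed onto its redis_key list. A minimal seeding sketch, assuming the redis-py package is installed and Redis runs on the local host/port configured above:

# seed_start_url.py -- push the first URL into the scrapy-redis start queue
import redis

r = redis.Redis(host='127.0.0.1', port=6379)
r.lpush('novelspider:start_urls', 'http://www.daomubiji.com/dao-mu-bi-ji-1')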

Add the chapter-text field to items.py:
    text = Field()  # full chapter text of one item
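
The spiders below also fill in bookName, bookTitle, chapterNum, chapterName and chapterURL, so a complete items.py would look roughly like this (a sketch inferred from the spider code, not the original file):

# items.py -- field names taken from the spiders below
from scrapy import Item, Field

class NovelspiderItem(Item):
    bookName = Field()     # series / book name
    bookTitle = Field()    # volume title
    chapterNum = Field()   # chapter number
    chapterName = Field()  # chapter name (used by the second spider)
    chapterURL = Field()   # chapter page URL
    text = Field()         # full chapter text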

The spider code below was found online; treat it as a reference.
#-*- coding: utf-8 -*-

from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request
from novelspider.items import NovelspiderItem
import json


class novelSpider(CrawlSpider):
    name = 'novelSpider'
    redis_key = 'novelSpider:start_urls'
    start_urls = ['http://www.daomubiji.com/']


    def parse(self, response):
        '''
        Parse the index page and extract the link to each book.
        :param response:
        :return:
        '''
        selector = Selector(response)
        section = selector.xpath('//article')
        bookUrls = section.xpath('p/a/@href').extract()
        print(bookUrls)
        for eachUrl in bookUrls:
            yield Request(eachUrl, callback=self.parse_news)

    def parse_news(self, response):
        '''
        Parse a book page: pull the book name, chapter number, chapter
        title and chapter URL out of every chapter entry.
        :param response:
        :return:
        '''
        selector = Selector(response)
        content = selector.xpath('/html/body/section/div[2]/div/article/a/text()').extract()
        urls = selector.xpath('/html/body/section/div[2]/div/article/a/@href').extract()
        item = NovelspiderItem()

        bookName = content[0]

        # Sha Hai (沙海) entries carry an extra leading token and need the
        # shifted field layout used below
        shahai_flg = False
        if bookName.split(' ')[0] == u'沙海1':
            shahai_flg = True

        # a second book is detected the same way and gets its own layout
        mhgc_flg = False
        if bookName.split(' ')[0] == u'    ( )':
            mhgc_flg = True

        for i, each in enumerate(content):
            # build a fresh item per chapter entry; entries that do not
            # match the expected layout are skipped
            item = NovelspiderItem()
            try:
                parts = each.split(' ')
                if shahai_flg:
                    item['bookName'] = parts[0] + parts[1]
                    item['chapterNum'] = parts[2]
                    item['bookTitle'] = parts[3]
                elif mhgc_flg:
                    item['bookName'] = u'    ' + parts[0]
                    item['chapterNum'] = parts[2]
                    item['bookTitle'] = parts[3]
                else:
                    item['bookName'] = parts[0]
                    item['chapterNum'] = parts[1]
                    item['bookTitle'] = parts[2]
                item['chapterURL'] = urls[i]
            except Exception:
                continue
            yield item
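
This first spider is a plain CrawlSpider, so it can be started with the usual scrapy crawl novelSpider command. For completeness, a small runner-script sketch, assuming it is executed from the Scrapy project root:

# run.py -- start the first spider from a script instead of the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('novelSpider')  # look the spider up by its name attribute
process.start()               # blocks until the crawl finishes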

#encoding=utf-8

from scrapy.selector import Selector
from novelspider.items import NovelspiderItem
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider
import re

class novSpider(RedisSpider):
    name = 'novelspider'
    redis_key = 'novelspider:start_urls'
    # start_urls is kept as a fallback; with RedisSpider the start URL is
    # normally pushed onto the redis_key list instead
    start_urls = ['http://www.daomubiji.com/dao-mu-bi-ji-1']

    def parse(self, response):
        selector = Selector(response)
        bookName = selector.xpath('//h1[@class="focusbox-title"]/text()').extract()[0]
        urls = selector.xpath('//article[@class="excerpt excerpt-c3"]/a/@href').extract()
        excerpts = selector.xpath('//article[@class="excerpt excerpt-c3"]/a/text()').extract()
        for i in range(len(urls)):
            # build one item per chapter link (reusing a single item would
            # make every request carry the last chapter's fields)
            item = NovelspiderItem()
            item['bookName'] = bookName
            item['chapterURL'] = urls[i]
            try:
                parts = excerpts[i].split(' ')
                item['bookTitle'] = parts[0]
                item['chapterNum'] = parts[1]
                item['chapterName'] = parts[2]
            except Exception:
                continue
            # meta={'item': item} hands the partially filled item to the
            # next callback, which adds the chapter text
            yield Request(urls[i], callback=self.parseContent, meta={'item': item})

    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']  # meta is a dict, not a callable
        html = selector.xpath('//div[@class="content"]').extract()[0]
        # the chapter text sits in <p> tags inside the content div
        textField = re.search('<div class="content">(.*?)</div>', html, re.S).group(1)
        text = re.findall('<p>(.*?)</p>', textField, re.S)
        print(text)
        fulltext = ''
        for each in text:
            fulltext += each
        item['text'] = fulltext
        yield item
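
The title promises MongoDB storage, but the post does not show the pipeline. A minimal pipelines.py sketch using pymongo (the database and collection names here are assumptions), enabled through ITEM_PIPELINES in settings.py:

# pipelines.py -- write every yielded item to MongoDB
import pymongo

class NovelspiderPipeline(object):
    def open_spider(self, spider):
        self.client = pymongo.MongoClient('localhost', 27017)
        self.collection = self.client['novel']['daomubiji']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # upsert by chapter URL so repeated crawls do not create duplicates
        self.collection.update_one(
            {'chapterURL': item.get('chapterURL')},
            {'$set': dict(item)},
            upsert=True,
        )
        return item

# settings.py
ITEM_PIPELINES = {'novelspider.pipelines.NovelspiderPipeline': 300}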
