Pyspider 및 pymysql 간단한 사용 사례

5458 단어
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-01-08 10:21:36
# Project: newv2ex

from pyspider.libs.base_handler import *
import pymysql
import random


class Handler(BaseHandler):
    """pyspider handler that crawls V2EX topics and stores them in MySQL.

    Crawl chain: home page -> tab pages -> board ("go") pages -> topic pages.
    The title and body of every topic page are inserted into the
    ``question`` table and also returned as the task result.
    """

    crawl_config = {
    }

    def add_question(self, title, content, comment_count=2):
        """Insert one scraped topic into the ``question`` table.

        Args:
            title: Topic title text.
            content: Topic body text.
            comment_count: Value stored in the ``comment_count`` column
                (defaults to 2, the value this handler has always written).

        The insert is best-effort: on any database error the transaction is
        rolled back, the error is printed, and the method returns normally so
        the crawl keeps going.
        """
        db = pymysql.connect(host="localhost", user="root", password="root", db="club", charset="utf8")
        try:
            cursor = db.cursor()
            # Scraped text is untrusted: let the driver escape the values
            # instead of interpolating them into the SQL string.
            sql = (
                'INSERT INTO question '
                '(title,content,user_id,created_date,comment_count) '
                'VALUES (%s,%s,%s,now(),%s)'
            )
            # user_id is picked at random from 1..10 -- presumably seed users
            # that already exist in the target database; TODO confirm.
            params = (title, content, random.randint(1, 10), comment_count)
            print(cursor.mogrify(sql, params))
            cursor.execute(sql, params)
            print(cursor.lastrowid)
            db.commit()
        except Exception as exc:
            print('add_question failed: %r' % (exc,))
            db.rollback()
        finally:
            # Always release the connection; one is opened per call.
            db.close()

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: crawl the V2EX home page (scheduled every 24 hours)."""
        self.crawl('https://www.v2ex.com/', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every ``?tab=`` link found on the home page."""
        for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)

    @config(priority=2)
    def tab_page(self, response):
        """Follow every ``/go/`` (board) link found on a tab page."""
        for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def board_page(self, response):
        """Queue every topic linked from a board page, then its pager links."""
        for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
            url = each.attr.href
            # Drop the "#replyN" fragment so one topic is not queued under
            # several different URLs.
            if '#reply' in url:
                url = url.split('#', 1)[0]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract a topic's title and body, store them, and return them."""
        title = response.doc('h1').text()
        content = response.doc('div.topic_content').text()
        # insert into MySQL
        self.add_question(title, content)
        return {
            "url": response.url,
            "title": title,
            "content": content,
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-01-08 20:58:58
# Project: newzhihu

from pyspider.libs.base_handler import *
import pymysql
import random

class Handler(BaseHandler):
    """pyspider handler that crawls Zhihu top answers into MySQL.

    Starting from a topic's "top-answers" listing, every linked question page
    is fetched; the question goes into the ``question`` table and each answer
    found on the page goes into the ``comment`` table.
    """

    crawl_config = {
        'headers': {
            'User-Agent': 'GoogleBot',
        }
    }

    def _insert(self, sql, params):
        """Run one parameterized INSERT on a fresh connection.

        Returns the cursor's ``lastrowid`` on success, or 0 if anything went
        wrong (the transaction is rolled back and the error printed).
        """
        db = pymysql.connect(host="localhost", user="root", password="root", db="club", charset="utf8")
        try:
            cursor = db.cursor()
            print(cursor.mogrify(sql, params))
            cursor.execute(sql, params)
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as exc:
            print('insert failed: %r' % (exc,))
            db.rollback()
            return 0
        finally:
            # Always release the connection; one is opened per call.
            db.close()

    def add_question(self, title, content, comment_count):
        """Insert a question row and return its id (0 on failure).

        Args:
            title: Question title text.
            content: Question detail text.
            comment_count: Number stored in the ``comment_count`` column.
        """
        # Scraped text is untrusted, so values are passed as query parameters
        # rather than interpolated into the SQL string.
        sql = (
            'INSERT INTO question '
            '(title,content,user_id,created_date,comment_count) '
            'VALUES (%s,%s,%s,now(),%s)'
        )
        # user_id is picked at random from 1..10 -- presumably seed users
        # that already exist in the target database; TODO confirm.
        qid = self._insert(sql, (title, content, random.randint(1, 10), comment_count))
        if qid:
            print(qid)
        return qid

    def add_comment(self, qid, comment):
        """Insert one comment row attached to question ``qid``.

        Best-effort: a database error is printed and swallowed.
        """
        sql = (
            'insert into comment'
            '(content, entity_type, entity_id, user_id, created_date) '
            'values (%s,%s,%s,%s,now())'
        )
        # entity_type is hard-coded to 1 -- presumably the code for
        # "question" in the target schema; TODO confirm.
        self._insert(sql, (comment, 1, qid, random.randint(1, 10)))

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: crawl the topic's top-answers listing (every 24 hours)."""
        self.crawl('https://www.zhihu.com/topic/19550517/top-answers', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every question link on a listing page, then its pager links."""
        for each in response.doc('a.question_link').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
        for each in response.doc('div.zm-invite-pager span a').items():
            self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Store a question and the answers found on its page; return the question."""
        title = response.doc('h1.QuestionHeader-title').text()
        content = response.doc('div.QuestionHeader-detail').text()
        # Run the answer selector once and reuse the texts for both the
        # count and the inserts.
        answers = [
            each.text()
            for each in response.doc('span.RichText.CopyrightRichText-richText').items()
        ]

        qid = self.add_question(title, content, len(answers))
        # add_question returns 0 when the insert failed; skip the comments
        # then, rather than attaching them to a non-existent question.
        if qid:
            for answer in answers:
                self.add_comment(qid, answer)

        return {
            "url": response.url,
            "title": title,
            "content": content,
        }

좋은 웹페이지 즐겨찾기