A simple example of using pyspider with pymysql

The two pyspider handlers below crawl V2EX and Zhihu respectively and write the scraped questions (and, in the second case, the answers) into a MySQL database through pymysql.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-01-08 10:21:36
# Project: newv2ex

from pyspider.libs.base_handler import *
import pymysql
import random


class Handler(BaseHandler):
    crawl_config = {
    }

    def add_question(self, title, content):
        # Open a connection per insert against the local "club" database.
        db = pymysql.connect(host="localhost", user="root", password="root", db="club", charset="utf8")
        try:
            cursor = db.cursor()
            # NOTE: building SQL with string formatting breaks on quotes in title/content
            # and is open to SQL injection; a parameterized version is sketched below.
            sql = 'INSERT INTO question (title,content,user_id,created_date,comment_count) VALUES("%s","%s",%d,now(),%d)' % (title, content, random.randint(1, 10), 2)
            print(sql)
            cursor.execute(sql)
            print(cursor.lastrowid)
            db.commit()
        except Exception:
            db.rollback()
        finally:
            db.close()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.v2ex.com/', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)

    @config(priority=2)
    def tab_page(self, response):
        for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def board_page(self, response):
        for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
            url = each.attr.href
            # Strip the "#reply" fragment so the same topic is not crawled twice.
            if url.find('#reply') > 0:
                url = url[0:url.find('#')]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        title = response.doc('h1').text()
        content = response.doc('div.topic_content').text()
        # Insert the topic into MySQL.
        self.add_question(title, content)
        return {
            "url": response.url,
            "title": title,
            "content": content
        }
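The INSERT above splices the scraped title and content straight into the SQL string, so any double quote in the page text breaks the statement and leaves the code open to SQL injection. Below is a minimal sketch of a drop-in alternative for Handler.add_question that passes the values as parameters so pymysql escapes them; it assumes the same local "club" database and "question" table, and the name add_question_safe is only for illustration.

    def add_question_safe(self, title, content):
        # Sketch only: same connection settings as above, but with placeholders
        # so pymysql escapes quotes in title/content for us.
        db = pymysql.connect(host="localhost", user="root", password="root",
                             db="club", charset="utf8")
        try:
            cursor = db.cursor()
            sql = ("INSERT INTO question (title, content, user_id, created_date, comment_count) "
                   "VALUES (%s, %s, %s, now(), %s)")
            cursor.execute(sql, (title, content, random.randint(1, 10), 2))
            db.commit()
            return cursor.lastrowid
        except Exception:
            db.rollback()
            return 0
        finally:
            db.close()

The second handler below applies the same pattern to Zhihu top answers: each question is stored as a row in question, and every answer is attached to it as a row in comment.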
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-01-08 20:58:58
# Project: newzhihu

from pyspider.libs.base_handler import *
import pymysql
import random


class Handler(BaseHandler):
    crawl_config = {
        'headers': {
            'User-Agent': 'GoogleBot',
        }
    }

    def add_question(self, title, content, comment_count):
        db = pymysql.connect(host="localhost", user="root", password="root", db="club", charset="utf8")
        try:
            cursor = db.cursor()
            # NOTE: same caveat as above - string-formatted SQL breaks on quotes and allows injection.
            sql = 'INSERT INTO question (title,content,user_id,created_date,comment_count) VALUES("%s","%s",%d,now(),%d)' % (title, content, random.randint(1, 10), comment_count)
            print(sql)
            cursor.execute(sql)
            qid = cursor.lastrowid
            db.commit()
            print(qid)
            return qid
        except Exception:
            db.rollback()
            return 0
        finally:
            db.close()

    def add_comment(self, qid, comment):
        db = pymysql.connect(host="localhost", user="root", password="root", db="club", charset="utf8")
        try:
            cursor = db.cursor()
            sql = 'INSERT INTO comment (content, entity_type, entity_id, user_id, created_date) VALUES ("%s",%d,%d,%d,now())' % (comment, 1, qid, random.randint(1, 10))
            print(sql)
            cursor.execute(sql)
            db.commit()
        except Exception:
            db.rollback()
        finally:
            db.close()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.zhihu.com/topic/19550517/top-answers', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a.question_link').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
        for each in response.doc('div.zm-invite-pager span a').items():
            self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        # Count the answers first, then store the question and attach each answer as a comment.
        items = response.doc('span.RichText.CopyrightRichText-richText').items()
        title = response.doc('h1.QuestionHeader-title').text()
        content = response.doc('div.QuestionHeader-detail').text()
        qid = self.add_question(title, content, sum(1 for x in items))
        for each in response.doc('span.RichText.CopyrightRichText-richText').items():
            self.add_comment(qid, each.text())
        return {
            "url": response.url,
            "title": title,
            "content": content,
        }
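Both handlers assume a "club" database that already contains question and comment tables; the original post does not show the schema. A minimal guess that matches the columns referenced above could be created once with a short pymysql script (column types and sizes are assumptions):

import pymysql

# Assumed minimal schema matching the columns used by the two handlers.
DDL = [
    """CREATE TABLE IF NOT EXISTS question (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        content TEXT,
        user_id INT,
        created_date DATETIME,
        comment_count INT
    )""",
    """CREATE TABLE IF NOT EXISTS comment (
        id INT AUTO_INCREMENT PRIMARY KEY,
        content TEXT,
        entity_type INT,
        entity_id INT,
        user_id INT,
        created_date DATETIME
    )""",
]

db = pymysql.connect(host="localhost", user="root", password="root", db="club", charset="utf8")
try:
    with db.cursor() as cursor:
        for stmt in DDL:
            cursor.execute(stmt)
    db.commit()
finally:
    db.close()

In add_comment, entity_type is hard-coded to 1 and entity_id receives the question id, presumably marking each stored answer as a comment that belongs to a question row.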