selenium + phantomJS로 크롤링하기 (더우반 도서)

3357 단어 selenium
selenium + phantomJS로 크롤링하기 (더우반 도서)
import re
import time

import pymysql
from lxml import etree
from selenium import webdriver
함수 만들기
def my_brower(url, page,
              executable_path=r'C:\Users\Administrator\Desktop\phantomjs-2.1.1-windows\bin\phantomjs.exe'):
    """Render *url* in a headless PhantomJS browser and parse the result.

    Args:
        url: Address of the search-result page to load.
        page: Result offset of the page. It is not used inside this
            function; it is kept so existing callers keep working.
        executable_path: Location of the PhantomJS binary handed to
            ``webdriver.PhantomJS``.
            NOTE(review): the default was reconstructed from a garbled
            path in the original listing -- confirm it matches the local
            installation.

    The rendered page source is passed to ``parse_html``; nothing is
    returned.
    """
    # NOTE(review): webdriver.PhantomJS is only available in older Selenium
    # releases -- confirm the installed version still provides it.
    browers = webdriver.PhantomJS(executable_path=executable_path)
    try:
        # Load the page and let the browser execute its JavaScript.
        browers.get(url)

        # Grab the rendered HTML.
        html = browers.page_source
    finally:
        # Always shut the PhantomJS process down; otherwise one orphaned
        # browser process is left behind for every page fetched.
        browers.quit()

    # Extract the book records from the page and store them.
    parse_html(html)

페이지 정보 분석
def _first(nodes):
    """Return the first XPath match, or None when nothing matched."""
    return nodes[0] if nodes else None


def _clean_book_name(book_name):
    """Remove quote characters and trailing backslashes from a book title."""
    book_name = book_name.replace('"', '').replace("'", '')
    # A trailing backslash would escape the closing quote of a quoted SQL
    # string, so drop it. Only trailing backslashes are removed; the title's
    # last real character is left intact.
    return book_name.rstrip('\\')


def parse_html(html):
    """Extract book records from a search-result page and store each one.

    Args:
        html: Page source as a string.

    Every ``div`` whose class contains ``sc-bZQynM`` is treated as one
    result card. For each card a dict with the keys ``book_pic``,
    ``book_name``, ``book_url``, ``book_score``, ``book_price``,
    ``book_date``, ``book_author`` and ``book_detail`` is built, printed
    and passed to ``insert_mysql``. Cards that lack any of the expected
    nodes, or whose metadata line has fewer than three ``/``-separated
    parts, are skipped instead of raising ``IndexError``.
    """
    tree = etree.HTML(html)
    # The class attribute holds several generated names, so match on the
    # one stable fragment with contains().
    books = tree.xpath("//div[contains(@class,'sc-bZQynM')]")

    title_xpath = ".//div[@class='title']/a[@class='title-text']"
    meta_xpath = ("./div[@class='item-root']/div[@class='detail']"
                  "/div[@class='meta abstract']/text()")

    for book in books:
        pic = _first(book.xpath(".//img/@src"))
        raw_name = _first(book.xpath(title_xpath + "/text()"))
        url = _first(book.xpath(title_xpath + "/@href"))
        score = _first(book.xpath(".//span[@class='rating_nums']/text()"))
        meta = _first(book.xpath(meta_xpath))

        # Skip cards that do not carry the full set of fields.
        if any(value is None for value in (pic, raw_name, url, score, meta)):
            continue

        # The metadata line is '/'-separated; the last three parts are read
        # as detail, date and price, and everything before them as authors.
        all_div = str(meta).split('/')
        print(all_div)
        if len(all_div) < 3:
            continue

        book_dict = {
            'book_pic': pic,
            'book_name': _clean_book_name(raw_name),
            'book_url': url,
            'book_score': score,
            'book_price': all_div[-1],
            'book_date': all_div[-2],
            'book_author': ','.join(all_div[:-3]),
            'book_detail': all_div[-3],
        }

        print(book_dict)
        # Persist the record.
        insert_mysql(book_dict)

def insert_mysql(book_dict):
    """Insert one book record into the ``python_book`` table.

    Args:
        book_dict: Mapping with the keys ``book_pic``, ``book_name``,
            ``book_url``, ``book_score``, ``book_price``, ``book_date``,
            ``book_author`` and ``book_detail``. The values are inserted
            in exactly that column order.

    A connection to the local ``test`` database is opened for the call and
    closed again before returning, whether or not the insert succeeds.
    """
    db = pymysql.connect(host='localhost', port=3306, password='1234',
                         user='root', db='test', charset='utf8')
    try:
        params = (
            book_dict['book_pic'],
            book_dict['book_name'],
            book_dict['book_url'],
            book_dict['book_score'],
            book_dict['book_price'],
            book_dict['book_date'],
            book_dict['book_author'],
            book_dict['book_detail'],
        )
        # Let the driver quote and escape every value. The data is scraped
        # text, so building the statement by string formatting would break
        # on embedded quotes and allow SQL injection.
        sql = 'insert python_book values (%s,%s,%s,%s,%s,%s,%s,%s);'
        with db.cursor() as cur:
            cur.execute(sql, params)

        # Make the insert permanent.
        db.commit()
    finally:
        db.close()

if __name__ == '__main__':
    # Walk the search-result pages; the `start` query parameter is the
    # result offset and advances by 15 per page.
    for i in range(0, 199):
        # NOTE(review): the banner text was reconstructed from a garbled
        # line in the original listing.
        print('==== 출력 {} 페이지 ='.format(i + 1))
        page = i * 15
        base_url = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start=' + str(page)
        my_brower(base_url, page)

좋은 웹페이지 즐겨찾기