입문급 크롤러: 더우반(Douban) Top 250 영화 정보 수집

3583 단어 크롤러
import requests
import lxml.html
from bs4 import BeautifulSoup
import re
import bs4
from pymongo import MongoClient

def req(url, param, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        param: Mapping of query-string parameters, forwarded to
            ``requests.get`` as ``params``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(url, params=param, timeout=timeout)
    # Fail here rather than handing an error page to the HTML parser.
    resp.raise_for_status()
    return resp.text

def get_data(data):
    """Parse one listing page and return one record per list item.

    Args:
        data: HTML text of the page.

    Returns:
        A list of dicts, one for each tag that is a direct child of the
        page's first ``<ol>``. Each dict has the keys:

        * ``name``  - two-element list: texts of ``span.title`` and texts of
          ``span.other``.
        * ``info``  - text nodes of the ``<p class="">`` element.
        * ``star``  - text nodes of ``span.rating_num``.
        * ``num``   - digits found in front of "人评价" in the item, as a
          string, or ``''`` when the item has no such text.
        * ``quote`` - text nodes of ``span.inq``.

        An empty list is returned when the page contains no ``<ol>``.
    """
    # NOTE(review): assumes each item renders its vote count as
    # "<digits>人评价" - confirm against the live page markup.
    votes_pattern = re.compile(r'(\d+)\s*人评价')

    source_soup = BeautifulSoup(data, 'html.parser')
    data_ol = source_soup.ol
    if data_ol is None:
        # No list on the page (e.g. an error or anti-bot page).
        return []

    films = []
    for tag_li in data_ol:
        # Children of <ol> also include whitespace strings; keep tags only.
        if not isinstance(tag_li, bs4.element.Tag):
            continue

        # Serialise the tag itself. str(tag_li.contents) would be the repr
        # of a Python list, which adds brackets, commas and quoted newlines
        # to the markup handed to lxml.
        item_html = str(tag_li)
        datas = lxml.html.fromstring(item_html)

        # Titles: main title(s) first, then alternative titles.
        names = [
            datas.xpath('.//span[@class="title"]/text()'),
            datas.xpath('.//span[@class="other"]/text()'),
        ]
        # Free-text description paragraph.
        info = datas.xpath('.//p[@class=""]/text()')
        # Rating score.
        star = datas.xpath('.//span[@class="rating_num"]/text()')
        # Vote count, searched in THIS item only so that each film gets its
        # own value instead of one value taken from the whole list.
        votes = votes_pattern.search(item_html)
        num = votes.group(1) if votes else ''
        # One-line quote.
        quote = datas.xpath('.//span[@class="inq"]/text()')

        films.append({
            'name': names,
            'info': info,
            'star': star,
            'num': num,
            'quote': quote,
        })

    return films

# Store the scraped records in the ``films2`` collection of the ``films``
# database on the local MongoDB server.
cli = MongoClient('localhost', 27017)
db = cli.films
url = 'https://movie.douban.com/top250'
# Ten requests with 'start' = 0, 25, ..., 225.
for i in range(1, 11):
    param = {
        'start': (i - 1) * 25,
        'filter': ""
    }
    films = get_data(req(url, param))
    # Collection.insert() is deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_many() is the replacement and rejects an empty list.
    if films:
        db.films2.insert_many(films)
print("spider success")

bs4, lxml.html의 xpath, requests를 사용했습니다. 독자 여러분의 많은 가르침을 바랍니다.

좋은 웹페이지 즐겨찾기