xpath를 이용하여 그림 다운로드

7161 단어
import requests
from lxml import etree
import os

def download_img(img_url, referer, timeout=30):
    """Download a single image into the local ``download`` directory.

    The file is named after the last ``/``-separated segment of ``img_url``.

    Args:
        img_url: URL of the image to fetch.
        referer: Value sent in the ``referer`` request header.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so one stalled connection cannot hang the whole crawl.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    print(img_url)
    headers = {
        'referer': referer,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    print(headers)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('download', exist_ok=True)
    filename = 'download/' + img_url.split('/')[-1]
    response = requests.get(img_url, headers=headers, timeout=timeout)
    if not response.ok:
        # Do not save an HTTP error page under an image file name.
        print('skip %s: HTTP %s' % (img_url, response.status_code))
        return
    with open(filename, 'wb') as f:
        f.write(response.content)

def parse_detailed_page(url_href):
    """Download the main image from every sub-page of one detail page.

    The number of sub-pages is read from the second-to-last entry of the
    ``pagenavi`` bar. Sub-page ``i`` is fetched from ``url_href + '/' + i``
    and its main image is passed to ``download_img`` with ``url_href`` as
    the referer.

    Args:
        url_href: URL of the detail page (without a trailing page number).
    """
    response = requests.get(url_href)
    html_ele = etree.HTML(response.text)
    page_labels = html_ele.xpath('//div[@class="pagenavi"]/a/span/text()')
    # The page count is the second-to-last label; without it (blocked request,
    # changed layout) there is nothing to iterate, so report and skip instead
    # of raising IndexError/ValueError and aborting the caller's loop.
    if len(page_labels) < 2 or not page_labels[-2].strip().isdigit():
        print('skip %s: page count not found' % url_href)
        return
    max_page = page_labels[-2]
    print(max_page)
    for i in range(1, int(max_page) + 1):
        page_url = url_href + '/' + str(i)
        response = requests.get(page_url)
        html_ele = etree.HTML(response.text)
        img_urls = html_ele.xpath('//div[@class="main-image"]/p/a/img/@src')
        if not img_urls:
            # A sub-page without a main image should not stop the remaining ones.
            print('skip %s: image not found' % page_url)
            continue
        download_img(img_urls[0], url_href)


# Script entry: fetch the site's front page and process every link found
# in its "pins" list.
url = 'http://www.mzitu.com/'
response = requests.get(url)
html_ele = etree.HTML(response.text)

# Each href taken from the list is handed to parse_detailed_page, which
# downloads the images reachable from that page.
href_list = html_ele.xpath('//ul[@id="pins"]/li/a/@href')
for href in href_list:
    parse_detailed_page(href)
--------------------------------------------------------------------
import requests
import re
from lxml import etree
from urllib import request
import os

# Crawl list pages 1..120 of the "xinggan" section. Every entry in a page's
# "pins" list gets a directory named after its title, and the main image of
# each of its sub-pages is saved into that directory.
for page in range(1, 121):
    url = 'http://www.mzitu.com/xinggan/page/%s/' % page
    response = requests.get(url)
    html_ele = etree.HTML(response.text)
    for li in html_ele.xpath('//ul[@id="pins"]/li'):
        title_links = li.xpath('./span[1]/a')
        detail_links = li.xpath('./a/@href')
        # Skip malformed entries instead of failing with IndexError.
        if not title_links or not detail_links or not title_links[0].text:
            continue
        title = title_links[0].text
        pull = detail_links[0]
        os.makedirs(title, exist_ok=True)

        response2 = requests.get(pull)
        html2_ele = etree.HTML(response2.text)
        nav_links = html2_ele.xpath('//div[@class="pagenavi"]/a/@href')
        # The second-to-last pagination link ends in the highest page number.
        last_page = nav_links[-2].split('/')[-1] if len(nav_links) >= 2 else ''
        if not last_page.isdigit():
            continue

        # A distinct loop variable keeps the outer list-page counter intact.
        for num in range(1, int(last_page) + 1):
            url3 = pull + '/%s' % num
            response3 = requests.get(url3)
            html3_ele = etree.HTML(response3.text)
            img_urls = html3_ele.xpath('//div[@class="main-image"]/p/a/img/@src')
            if not img_urls:
                continue
            img_url = img_urls[0]
            img_name = '{}/'.format(title) + img_url.split('/')[-1]
            headers = {
                'referer': url3,
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            }
            img_response = requests.get(img_url, headers=headers)
            if not img_response.ok:
                # Do not save an HTTP error page under an image file name.
                continue
            with open(img_name, 'wb') as f:
                f.write(img_response.content)
            print('%s    ' % img_name)
    # The original format string had no argument and printed a literal "%s";
    # report the list page that was just finished.
    print('%s      ' % page)

좋은 웹페이지 즐겨찾기