mzitu 이미지 크롤러: 가장 중요한 것은 요청 헤더에 'Referer'를 'http://www.mzitu.com/'로 설정하는 것입니다.

9524 단어 python

1. 웹 페이지 소스 코드 가져오기 및 HTTP 요청 헤더 설정

import requests
from bs4 import BeautifulSoup
import os

# Send a browser-like User-Agent instead of the default one used by requests.
headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}

# Index page that links to every gallery.
url = 'http://www.mzitu.com/all/'

# Download the index page. The timeout (seconds) matters: without one,
# requests waits indefinitely when the server never answers.
start_html = requests.get(url, headers=headers, timeout=10)
# print(start_html.text)            # uncomment to inspect the downloaded HTML

2. BeautifulSoup 모듈로 페이지 분석

# Parse the downloaded bytes with the lxml parser.
Soup = BeautifulSoup(start_html.content,'lxml')

# Save the page text to disk so it can be inspected offline.
with open('yuanma.html','wb') as f:
    # The file is opened in binary mode, so f.write(Soup.text) would raise
    # "TypeError: a bytes-like object is required, not 'str'".
    # Encode the text to bytes before writing.
    f.write(bytes(Soup.text,'utf-8'))

오류:
TypeError: a bytes-like object is required, not 'str'
원인: 파일을 바이너리 모드('wb')로 열었으므로 str을 bytes로 변환해야 합니다.
해결: a = bytes(Soup.text,'utf-8')

3. 분석 후 해당하는 태그를 가져오고 폴더를 만듭니다.

# Collect every <a> tag found inside the <div class="all"> element.
all_a = Soup.find('div',class_='all').findAll('a')
for a in all_a:

    # print(a,type(a))   # each item is a bs4.element.Tag
    title = a.text    # alternative spelling: title = a.get_text()
    path = str(title).strip()
    # Create a folder named after the link text, relative to the current
    # working directory. NOTE: this first attempt uses the raw title as the
    # folder name and fails with the NotADirectoryError quoted just below.
    os.makedirs(os.path.join(path))

오류: NotADirectoryError: [WinError 267] 디렉터리 이름이 올바르지 않습니다.: ' -KiKi:    ,           '

원인: 폴더 이름에 ':' 나 '?' 같은 특수 문자가 포함되어 있습니다.
수정한 코드:
# Second attempt: sanitize each title before using it as a folder name and
# fall back to a numbered folder when creation still fails.
# `all_path` is the download root; it is defined in the complete script
# further down in this article.
for n, a in enumerate(all_a, start=1):
    m = str(n)
    # print(a,type(a))   # each item is a bs4.element.Tag
    title = a.get_text()
    # '?' and ':' are not valid in Windows folder names; blank them out.
    title = title.replace('?',' ')
    title = title.replace(':',' ')
    path = str(title).strip()
    try:
        os.makedirs(os.path.join(all_path,path))
    except OSError as e:
        # NotADirectoryError is a subclass of OSError, so this one clause
        # covers both. The earlier `NotADirectoryError and OSError` was a
        # boolean expression that evaluated to OSError alone.
        print(e)
        os.makedirs(os.path.join(all_path,'    '+m))

4. 태그가 가리키는 개별 페이지 주소 가져오기

# Read the link target (href attribute) of the current <a> tag from the loop.
href = a['href']

5. 개별 페이지의 소스 코드를 가져와 분석합니다. 1, 2단계와 유사합니다.
6. mzitu 요청 헤더 추가:

# Request headers: the browser-like User-Agent from step 1 plus a Referer
# entry naming the site itself as the referring page. The article singles out
# the Referer entry as the most important setting.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
    ),
    'Referer': 'http://www.mzitu.com/',
}

최종 코드:

import requests
from bs4 import BeautifulSoup
import os

# Headers sent with every request: a browser-like User-Agent plus a Referer
# naming the site itself. The article singles out the Referer entry as the
# most important setting.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
    ),
    'Referer': 'http://www.mzitu.com/',
}

# Index page that links to every gallery.
url = 'http://www.mzitu.com/all/'

# Seconds to wait for any single HTTP response. Without a timeout, requests
# blocks forever when the server stops answering.
REQUEST_TIMEOUT = 10

start_html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail early with a clear HTTP error instead of an AttributeError later when
# the expected <div class="all"> is missing from an error page.
start_html.raise_for_status()

Soup = BeautifulSoup(start_html.content, 'lxml')
all_a = Soup.find('div', class_='all').findAll('a')

# Download root: "<current working directory>/tupian/". The empty last
# component keeps the trailing separator and os.path.join picks the right
# separator for the platform.
all_path = os.path.join(os.getcwd(), 'tupian', '')

# Number of gallery folders created during this run.
b = 0
for a in all_a:
    # '?' and ':' are not valid in Windows folder names; blank them out.
    title = a.get_text().replace('?', ' ').replace(':', ' ')
    path = title.strip()
    gallery_dir = os.path.join(all_path, path)
    try:
        os.makedirs(gallery_dir)
    except FileExistsError:
        # Folder already exists: skip this gallery.
        continue
    except OSError as e:
        # NotADirectoryError is a subclass of OSError, so this clause covers
        # both. The earlier `NotADirectoryError and OSError` evaluated to
        # OSError alone.
        print(e)
        break
    b += 1

    href = a['href']
    print(href)
    html = requests.get(href, headers=headers, timeout=REQUEST_TIMEOUT)
    html_Soup = BeautifulSoup(html.text, 'lxml')
    # The second-to-last <span> in the page navigation holds the page count.
    max_span = html_Soup.find('div', class_='pagenavi').find_all('span')[-2].get_text()

    count = 0
    for page in range(1, int(max_span) + 1):
        page_url = href + '/' + str(page)
        img_html = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
        img_Soup = BeautifulSoup(img_html.content, 'lxml')

        tupian = img_Soup.find('div', class_='main-image').find('img')['src']
        # File name: the five characters before the 4-character extension.
        name = tupian[-9:-4]
        tp = requests.get(tupian, headers=headers, timeout=REQUEST_TIMEOUT)
        # Write into the gallery folder by full path instead of changing the
        # working directory. 'wb' rather than 'ab' so a repeated name
        # overwrites the file instead of appending and corrupting it.
        with open(os.path.join(gallery_dir, name + '.jpg'), 'wb') as f:
            f.write(tp.content)
        count += 1
        if count >= 1:   # trial limit: only the first image of each gallery
            break
    if b >= 3:    # trial limit: stop after three galleries
        break

# x = requests.get('http://images2015.cnblogs.com/blog/140867/201601/140867-20160103115154339-792142004.png',headers=headers)
# print(x.content)
# f = open('1'+'.jpg','ab')
# f.write(x.content)
# f.close()

이 내용에 흥미가 있습니까?

현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:

로마 숫자를 정수로 또는 그 반대로 변환

그 중 하나는 로마 숫자를 정수로 변환하는 함수를 만드는 것이었고 두 번째는 그 반대를 수행하는 함수를 만드는 것이었습니다. 문자만 포함합니다'I', 'V', 'X', 'L', 'C', 'D', 'M' ; 문자열이 ...

텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.

CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.

좋은 웹페이지 즐겨찾기

개발자 우수 사이트 수집

개발자가 알아야 할 필수 사이트 100선 추천 우리는 당신을 위해 100개의 자주 사용하는 개발자 학습 사이트를 정리했습니다