크롤러 예제 - 여러 페이지, 함수 템플릿

11107 단어 WebSpider
주소: http://bj.xiaozhu.com/ 포함 정보: 여러 페이지, 각 페이지마다 링크 24개. 크롤링 요구 사항: 모든 링크의 제목, 주소, 가격, 이미지 링크, 호스트 이름, 호스트 성별
from bs4 import BeautifulSoup
import requests

def get_info(page_number):
    """Scrape and print the details of every listing found on the search pages.

    The listing URLs come from ``get_page_link(page_number)``. Each listing
    page is downloaded and parsed, and one dict per listing is printed with
    the keys ``title``, ``address``, ``price``, ``pic``, ``host_name`` and
    ``host_gender``.

    Args:
        page_number: Passed through unchanged to ``get_page_link``.

    Raises:
        requests.exceptions.Timeout: If a listing page does not answer within
            the timeout.
        IndexError: If a listing page lacks one of the selected elements
            (each selector result is indexed with ``[0]``).
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    timeout_seconds = 10

    # CSS class of the host badge -> label stored under 'host_gender'.
    # Built once here instead of re-defining a helper on every iteration.
    # NOTE(review): both labels are the same single-space string, so the two
    # classes cannot be told apart in the output -- the intended labels appear
    # to be missing; confirm and restore them.
    gender_labels = {
        'member_ico1': ' ',
        'member_ico': ' ',
    }

    for url in get_page_link(page_number):
        wb_data = requests.get(url, timeout=timeout_seconds)
        soup = BeautifulSoup(wb_data.text, 'html.parser')

        title = soup.title.text
        address = soup.select('div.pho_info > p')[0].get('title')
        price = soup.select('div.day_l > span')[0].text
        pic = soup.select('#curBigImage')[0].get('src')
        host_name = soup.select('a.lorder_name')[0].text
        # First CSS class of the badge element identifies the host's gender.
        host_gender = soup.select('div.member_pic > div')[0].get('class')[0]

        data = {
            'title': title,
            'address': address,
            'price': price,
            'pic': pic,
            'host_name': host_name,
            # Any badge class not in the table yields None.
            'host_gender': gender_labels.get(host_gender),
        }
        print(data)

def get_page_link(page_number):
    """Collect the listing links from search result pages 1..page_number.

    Args:
        page_number: Number of the last search result page to fetch
            (inclusive). Values below 1 fetch nothing.

    Returns:
        A list holding the ``href`` of every ``a.resule_img_a`` anchor, in
        page order. Empty when ``page_number`` is below 1.

    Raises:
        requests.exceptions.Timeout: If a search page does not answer within
            the timeout.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    timeout_seconds = 10

    page_link = []
    # range() excludes its stop value; the + 1 makes the last requested page
    # part of the crawl (previously page_number == 1 fetched nothing).
    for each_number in range(1, page_number + 1):
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        wb_data = requests.get(full_url, timeout=timeout_seconds)
        soup = BeautifulSoup(wb_data.text, 'html.parser')
        page_link.extend(
            anchor.get('href') for anchor in soup.select('a.resule_img_a')
        )
    return page_link
if __name__ == '__main__':
    import sys

    # The bare call used a module-level name `page_number` that is never
    # defined, which raised NameError as soon as the file was loaded. Read the
    # value from the command line instead and only run when executed as a
    # script.
    if len(sys.argv) != 2 or not sys.argv[1].isdigit():
        sys.exit('usage: {} PAGE_NUMBER'.format(sys.argv[0]))
    get_info(int(sys.argv[1]))

부분 출력:
{'title': '                      -       |    -  ', 'host_name': 'zoehh', 'price': '398', 'address': '           ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,61,6262,1800,1200,05be8a2a.jpg', 'host_gender': ' '}
{'title': '【  】  Soho    &       -       |    -  ', 'host_name': 'Liicy', 'price': '285', 'address': '          ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,56,6219,1800,1200,27d55c0f.jpg', 'host_gender': ' '}
{'title': '               -       |    -  ', 'host_name': '  ', 'price': '395', 'address': '            ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,62,2913,1800,1200,4ecf03a3.jpg', 'host_gender': None}
{'title': '  6            -       |    -  ', 'host_name': '      ', 'price': '197', 'address': '      6           ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/4,0,26,6729,1800,1200,768006fe.jpg', 'host_gender': ' '}
{'title': '              -       |    -  ', 'host_name': '  ', 'price': '596', 'address': '                ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,3,992,1800,1200,32297300.jpg', 'host_gender': ' '}
{'title': '          ,           -       |    -  ', 'host_name': 'zhengfanwu', 'price': '998', 'address': '           ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,99,6792,1800,1200,65ec7d49.jpg', 'host_gender': ' '}
{'title': '【     】   10          -       |    -  ', 'host_name': '    ', 'price': '228', 'address': '             (        500 ,      )', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/1,0,94,4002,825,550,d2a2390e.jpg', 'host_gender': ' '}
{'title': '      -    6  -       |    -  ', 'host_name': '     ', 'price': '108', 'address': '             ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,5,3548,1800,1200,9f2e73e2.jpg', 'host_gender': ' '}
{'title': '    ,    5  ,       -       |    -  ', 'host_name': '  ', 'price': '395', 'address': '               ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/6,0,66,803,1800,1200,38a4c686.jpg', 'host_gender': None}
{'title': '     798、  、         。-       |    -  ', 'host_name': '   Sunny', 'price': '268', 'address': '         ', 'pic': 'http://image.xiaozhustatic1.com/00,800,533/2,0,71,458,1800,1200,a9c5ea82.jpg', 'host_gender': None}

좋은 웹페이지 즐겨찾기