360 검색엔진 데이터 캡처

BeautifulSoup 웹 페이지 분석

#-*- coding:utf-8 -*-
#Filename:360    
#Author:Guan
#Datetime:2018/11/30

import requests
from bs4 import  BeautifulSoup
import json
import time

def get_html(url, timeout=10):
    """Fetch *url* with browser-like headers and return the body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without a timeout a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        The response body decoded with ``bytes.decode()`` (UTF-8).

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # NOTE(review): the Cookie below is a hard-coded browser session captured
    # in 2018; it is presumably stale and should not live in source control.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        "Cookie":"Q=u%3D360H3096670450%26n%3D%26le%3D%26m%3DZGZmWGWOWGWOWGWOWGWOWGWOZwHl%26qid%3D3096670450%26im%3D1_t01c37c6928fc149034%26src%3D360index%26t%3D1; T=s%3Dccd759892e1b135a3419e01e23177efa%26t%3D1542679014%26lm%3D%26lf%3D%26sk%3D34a25b6ef06eb6640f916a7ac7dd02ba%26mt%3D1542679014%26rc%3D%26v%3D2.0%26a%3D1; QiHooGUID=45A4F6333A9E13202FF582D464E8CB16.1543578963037; __guid=15484592.4007070523124616700.1543578964544.7502; webp=1; stc_ls_sohome=RGzW2OYRKV!3TRXVhIMSWA; __huid=11ZSgWXOw0Wun4Is5XEqKzQ7U4mjrXUxDivKDEINKN3pU%3D; gtHuid=1; dpr=1.25; count=3; _pp_wd=1; erules=p1-14%7Cecr-3%7Cp4-14%7Cp2-5%7Cp3-6"
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.content.decode()

def get_cont(html):
    """Parse a search-result page and append every hit to the output file.

    Each element matching ``.res-list`` becomes one dict with the keys
    ``title`` (text of the first ``<h3>``), ``cont_one`` (text of the first
    ``<p>``) and, only when a rich-news ``<div>`` is present, ``cont_two``.
    Every dict is serialised as one JSON line, printed, and appended to the
    output file.

    Args:
        html: Markup of the result page.

    Returns:
        None. Results are reported through stdout and the output file.
    """
    soup = BeautifulSoup(html, 'lxml')
    records = []
    for item in soup.select('.res-list'):
        record = {}
        # find() yields None when the tag is missing, so a result block
        # without a heading or paragraph no longer aborts the whole crawl
        # with an IndexError; the field is stored as an empty string instead.
        heading = item.find('h3')
        record['title'] = heading.text.strip() if heading is not None else ''
        # If several rich-news blocks exist, the last one wins.
        for rich in item.select('div[class="res-rich so-rich-news clearfix"]'):
            record['cont_two'] = rich.text.strip()
        paragraph = item.find('p')
        record['cont_one'] = paragraph.text.strip() if paragraph is not None else ''
        records.append(record)

    if not records:
        # Nothing to write; avoid creating an empty output file.
        return

    # Open the output once per page instead of once per record.
    with open('D:\\    \\.PyCharmCE2018.2\\config\\scratches\\  \\360  \\360      ', 'a', encoding='utf-8') as f:
        for record in records:
            line = json.dumps(record, ensure_ascii=False)
            print(line)
            f.write(line + '\n')

if __name__ == '__main__':
    # Load the search keywords: the input file holds comma-separated terms,
    # possibly spread over several lines.
    # NOTE(review): this path looks like it lost non-ASCII characters and now
    # equals the output path used by get_cont - confirm the intended file.
    chexing = []
    with open('D:\\    \\.PyCharmCE2018.2\\config\\scratches\\  \\360  \\360      ') as file:
        for line in file:
            for keyword in line.split(','):
                # Strip the trailing newline/whitespace so it does not end up
                # inside the query string, and ignore blank entries.
                keyword = keyword.strip()
                if keyword:
                    chexing.append(keyword)

    # Crawl result pages 1..89 for every keyword, page by page.
    for i in range(1, 90):
        for j in chexing:
            url = 'https://www.so.com/s?q=%s' % j + '&pn=%d' % i
            print('    %s   %d    ' % (j, i))
            html = get_html(url)
            # Pause between requests to avoid hammering the search engine.
            time.sleep(2)
            get_cont(html)
    print('      ')
360 검색엔진 데이터 캡처

BeautifulSoup 웹 페이지 분석

좋은 웹페이지 즐겨찾기