Python 3 --- Processing Crawler Data

1. Processing data with the regular expression module re

Regular expression rules
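
Before the full spider below, here is a minimal sketch (using a toy HTML snippet, not the real page markup) of the core technique: compile a non-greedy pattern with re.S so that '.' also matches newlines, then pull out every match with findall.

import re

# toy HTML, for illustration only
html = '<p>first joke</p>\n<p>second\njoke</p>'

# re.S lets '.' match newlines, so one item may span several lines;
# (.*?) is non-greedy, so each match stops at the first closing tag
pattern = re.compile(r'<p>(.*?)</p>', re.S)
print(pattern.findall(html))  # ['first joke', 'second\njoke']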
from urllib import request, parse
import re

class Spider:
    def __init__(self):
        # current page number
        self.page = 1
        # crawl switch; keep crawling while True
        self.switch = True

    def loadPage(self):
        """
        Download the page
        :return:
        """
        print("Downloading data.....")
        url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
        }
        req = request.Request(url, headers=headers)
        res = request.urlopen(req)
        # the site serves GBK-encoded pages, so decode accordingly
        html = res.read().decode('gbk')
        # print(html)

        # each joke sits in a <div class="f18 mb20"> block; re.S lets '.' match
        # newlines (the original pattern was stripped when the article was
        # rendered, so the tag and class names here are reconstructed)
        pattern = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)
        content_list = pattern.findall(html)
        self.dealPage(content_list)

    def dealPage(self, content_list):
        """
        Process the jokes on each page
        :return:
        """
        for item in content_list:
            # strip the leftover HTML tags (the tag names were lost when the
            # article was rendered, so these are reconstructed)
            item = item.replace("<p>", "").replace("</p>", "").replace("<br>", "").replace("<br />", "")
            print("Writing data...")
            self.writePage(item)

    def writePage(self, item):
        """
        Write each joke to the file one by one
        :return:
        """
        with open("duanzi.txt", "a", encoding="gbk") as f:
            f.write(item)

    def startWork(self):
        """
        Control the running of the crawler
        :return:
        """
        while self.switch:
            command = input("Press Enter to keep crawling (type quit to exit): ")
            if command == "quit":
                self.switch = False
            else:
                self.loadPage()
                self.page += 1

if __name__ == "__main__":
    spider = Spider()
    # spider.loadPage()
    spider.startWork()
 
2. Processing data with XPath
 
For details on using XPath, see: XPath --- Usage Summary
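
As a warm-up, here is a minimal sketch of the etree workflow the spider below relies on, run against a toy HTML fragment (illustrative markup, not Baidu Tieba's actual page): parse the string into a document tree with etree.HTML(), then collect attribute values with an XPath expression.

from lxml import etree

# toy HTML, for illustration only
html = '''
<div class="threadlist_lz clearfix">
  <div><a href="/p/111">post one</a></div>
  <div><a href="/p/222">post two</a></div>
</div>
'''

# etree.HTML() builds a tree even from imperfect markup;
# the XPath below collects the href attribute of every post link
selector = etree.HTML(html)
links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
print(links)  # ['/p/111', '/p/222']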
import os
from urllib import request, parse
from lxml import etree

class Spider:
    def __init__(self):
        self.tiebaName = input("Enter the name of the tieba to crawl: ")
        self.beginPage = int(input("Enter the starting page: "))
        self.endPage = int(input("Enter the ending page: "))

        self.url = 'http://tieba.baidu.com/f'
        self.ua_header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

        # image counter, used to name the saved files
        self.userName = 1

    def tiebaSpider(self):
        for page in range(self.beginPage, self.endPage + 1):
            pn = (page - 1) * 50  # page number
            word = {'pn': pn, 'kw': self.tiebaName}

            # url-encode the query parameters (urlencode lives in
            # urllib.parse, not urllib.request)
            word = parse.urlencode(word)
            myUrl = self.url + "?" + word

            # Example: http://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&pn=50
            # Call loadPage to download the page
            # and extract the post links on it
            self.loadPage(myUrl)

    # Fetch the page content
    def loadPage(self, url):
        req = request.Request(url, headers=self.ua_header)
        html = request.urlopen(req).read()

        # parse the html into an HTML document tree
        selector = etree.HTML(html)

        # extract the relative url of each post on the page, e.g.
        # http://tieba.baidu.com/p/4884069807 ends in "p/4884069807"
        links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')

        # links is a list of href strings;
        # build the full url of each post and hand it to loadImages
        for link in links:
            link = "http://tieba.baidu.com" + link
            self.loadImages(link)

    # Fetch the images in a post
    def loadImages(self, link):
        req = request.Request(link, headers=self.ua_header)
        html = request.urlopen(req).read()

        selector = etree.HTML(html)

        # collect the src attribute of every image in the post
        imagesLinks = selector.xpath('//img[@class="BDE_Image"]/@src')

        # download and save each image
        for imagesLink in imagesLinks:
            self.writeImages(imagesLink)

    # Save the images to disk
    def writeImages(self, imagesLink):
        '''
        Write the image data to the images folder, using userName as the filename
        '''

        print(imagesLink)
        print("Saving image %d ..." % self.userName)

        # make sure the target folder exists (this is what os is imported for)
        os.makedirs('./images', exist_ok=True)

        # 1. open the file in binary write mode
        file = open('./images/' + str(self.userName) + '.png', 'wb')

        # 2. download the image data
        images = request.urlopen(imagesLink).read()

        # 3. write the image data to the file
        file.write(images)

        # 4. close the file
        file.close()

        # increment the counter by 1
        self.userName += 1

# simulate the main function
if __name__ == "__main__":

    # create a spider object
    mySpider = Spider()
    # start crawling, driven by the page range entered above
    mySpider.tiebaSpider()

 
3. Processing data with BeautifulSoup4
 
For instructions on using BeautifulSoup4, see: Python 3 --- BeautifulSoup4 Usage Summary

from bs4 import BeautifulSoup
from urllib import request
import json  # used to serialize the scraped items

def tencent():
    url = 'http://hr.tencent.com/'
    req = request.Request(url + 'position.php?&start=10#a')
    response = request.urlopen(req)
    resHtml = response.read()
    output = open('tencent.json', 'w')
    html = BeautifulSoup(resHtml, 'lxml')

    # CSS selectors: the job rows alternate between the even and odd classes
    result = html.select('tr[class="even"]')
    result2 = html.select('tr[class="odd"]')
    result += result2

    items = []
    for site in result:
        item = {}
        name = site.select('td a')[0].get_text()
        detailLink = site.select('td a')[0].attrs['href']
        catalog = site.select('td')[1].get_text()
        recruitNumber = site.select('td')[2].get_text()
        workLocation = site.select('td')[3].get_text()
        publishTime = site.select('td')[4].get_text()

        item['name'] = name
        item['detailLink'] = url + detailLink
        item['catalog'] = catalog
        item['recruitNumber'] = recruitNumber
        item['workLocation'] = workLocation
        item['publishTime'] = publishTime

        items.append(item)

    # disable the default ascii escaping so the output is written as utf-8
    line = json.dumps(items, ensure_ascii=False)
    output.write(line)
    output.close()

if __name__ == "__main__":
    tencent()
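
For reference, here is a minimal sketch of the select() calls the program leans on, run against a toy table (illustrative markup, not Tencent's actual page): select() takes a CSS selector and returns a list of matching tags, from which get_text() and attrs pull out the data.

from bs4 import BeautifulSoup

# toy HTML, for illustration only
html = '''
<table>
  <tr class="even"><td><a href="position_detail.php?id=1">engineer</a></td><td>TEG</td></tr>
  <tr class="odd"><td><a href="position_detail.php?id=2">designer</a></td><td>IEG</td></tr>
</table>
'''

soup = BeautifulSoup(html, 'lxml')

# select() accepts CSS selectors and returns a list of matching tags
rows = soup.select('tr[class="even"]') + soup.select('tr[class="odd"]')
for row in rows:
    link = row.select('td a')[0]
    print(link.get_text(), link.attrs['href'], row.select('td')[1].get_text())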
