Scraping News with a Crawler (2)

After pulling the news with version 1, two shortcomings stood out:
  • you cannot choose how many articles to crawl
  • it is far too slow

This time, let's optimize both: read a number from the user to set how many articles to save per channel, and use multithreading to speed up the downloads.
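One note on the input handling before the full script: the script below treats a blank line as "use the default of 50", but a non-numeric entry would crash int(). A minimal, hedged sketch of a sturdier prompt (ask_count is my name for it, not part of the original script):

    def ask_count(default=50):
        # Hypothetical helper: fall back to the default on blank or bad input.
        raw = input("How many articles per channel (default %d): " % default).strip()
        try:
            return int(raw) if raw else default
        except ValueError:
            return default

The full script: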
    # -*- coding:utf-8 -*-

    import os
    import re
    import threading
    import time
    from urllib import request

    from lxml import etree


    def StringListSave(save_path, filename, slist):
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        path = save_path + "/" + filename + ".txt"
        with open(path, "w+", encoding='GB18030') as fp:
            for s in slist:
                fp.write("%s\t\t%s\n" % (s[0], s[1]))

    def CellPage(save_path, filename, slist, num=50):
        '''Save up to num article pages of one channel as .html files.'''
        folder = save_path + '/' + filename
        print(folder)
        if not os.path.exists(folder):
            os.mkdir(folder)
        i = 0
        for item, url in slist:
            # stop once the requested number of articles has been saved
            if i >= num:
                break
            # strip characters that are illegal in Windows file names
            newitem = re.sub(r"[\/\\\:\*\?\"\<\>\|]", "", item)
            print(item)
            with open(folder + '/' + newitem + '.html', "w+", encoding='GB18030') as fp:
                PageContent = request.urlopen(url).read().decode("GB18030")
                fp.write("%s\n" % PageContent)
            i += 1

    def Page_Info(myPage):
        '''Regex'''
        mypage_Info = re.findall(
            r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>',
            myPage, re.S)
        return mypage_Info

    def New_Page_Info(new_page):
        '''Regex (slow) or XPath (fast)'''
        dom = etree.HTML(new_page)
        new_items = dom.xpath('//tr/td/a/text()')
        new_urls = dom.xpath('//tr/td/a/@href')
        assert len(new_items) == len(new_urls)
        return zip(new_items, new_urls)

    def Save(tuple1, i, save_path, num):
        item = tuple1[0]
        url = tuple1[1]
        print("downloading ", url)
        new_page = request.urlopen(url).read().decode("GB18030")
        newPageResults = New_Page_Info(new_page)
        filename = str(i) + "_" + item
        StringListSave(save_path, filename, newPageResults)
        # zip() is a one-shot iterator in Python 3, so parse again for CellPage
        newPageResults = New_Page_Info(new_page)
        if num:  # the user supplied a count
            CellPage(save_path, filename, newPageResults, num)
        else:    # blank input: fall back to the default of 50
            CellPage(save_path, filename, newPageResults)


    def Spider(url):
        i = 0
        num = input("How many articles per channel (default 50): ")
        if num.strip():
            num = int(num)
        print("downloading ", url)
        myPage = request.urlopen(url).read().decode("GB18030")
        myPageResults = Page_Info(myPage)
        ntime = time.strftime("%Y%m%d", time.localtime(time.time()))
        save_path = "news-" + ntime
        filename = str(i) + "_" + "Ranking"
        StringListSave(save_path, filename, myPageResults)

        threads = []
        # one download thread per channel
        for i, result in enumerate(myPageResults):
            t = threading.Thread(target=Save, args=(result, i, save_path, num))
            threads.append(t)
        # start every thread, then wait for all of them to finish
        for t in threads:
            t.start()
        for t in threads:
            t.join()


    if __name__ == '__main__':
        print("start")
        start_url = "http://news.163.com/rank/"
        Spider(start_url)
        print("end")

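The script starts one thread per channel, which is fine for the couple of dozen channels on the ranking page but unbounded in general. A minimal sketch, assuming the Save function and names above, of the same fan-out with concurrent.futures so that at most a fixed number of downloads run at once (spider_with_pool and max_workers are my choices, not the post's):

    from concurrent.futures import ThreadPoolExecutor

    def spider_with_pool(results, save_path, num, max_workers=8):
        # one task per channel, at most max_workers running concurrently
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(Save, item, i, save_path, num)
                       for i, item in enumerate(results)]
            for f in futures:
                f.result()  # surface any exception raised in a worker

A side benefit: result() re-raises worker exceptions in the main thread, whereas the bare threading version only prints them to stderr.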