Python으로 더우딩왕(豆丁网, docin.com) 문서 다운로드

4053 단어 python
더우딩왕(docin.com) 문서는 자체 형식(docin)을 사용하기 때문에 다운로드한 후에 복호화(압축 해제)해야 합니다...
common.py 파일: http://blog.csdn.net/qq506657335/article/details/20004903
docinDownloader.py
import re;
import os;
import common;
import docinParse;

# Captures the numeric page ID from URLs such as
# http://www.docin.com/p-760258140.html (anything after ".htm" is ignored).
# Raw string so that "\." and "\d" reach the regex engine untouched instead of
# being treated as (invalid) string escape sequences.
reg_getDocinPageID = re.compile(r"http://www\.docin\.com/p-(\d+)\.htm")


def getPageID(url):
    """Return the docin page ID embedded in *url*, or None.

    Args:
        url: Page address, e.g. "http://www.docin.com/p-760258140.html".

    Returns:
        The digits following "p-" as a string, or None when *url* contains no
        docin page address or is not a string at all.
    """
    try:
        match = reg_getDocinPageID.search(url)
    except TypeError:
        # Non-string input (e.g. None) keeps the historical "no ID" answer
        # without hiding unrelated errors the way a bare ``except`` would.
        return None
    if match is None:
        return None
    return match.group(1)

class docinDownloader():
    """Download the raw ``.docin`` part files that make up a docin.com document."""

    def __init__(self):
        pass

    def download(self, url, savePath = "./tmpDocin"):
        """Fetch the ``.docin`` parts of the document at *url* into *savePath*.

        Parts are requested one after another and stored as
        ``<savePath>/<title>_<n>.docin``. Downloading stops at the first part
        that is missing or smaller than 1024 bytes (that file is deleted).

        Args:
            url: Address of the docin page; must contain a page ID that
                ``getPageID`` can extract.
            savePath: Directory that receives the part files.

        Raises:
            ValueError: if no page ID can be extracted from *url*. Previously
                the literal text "None" was formatted into the request URL.
        """
        pageID = getPageID(url)
        if pageID is None:
            raise ValueError("not a recognised docin page URL: {0!r}".format(url))

        common.createDir(savePath)
        self._pageID = pageID
        # NOTE(review): the suffix stripped here looks like it lost non-ASCII
        # characters (probably the site name) during extraction -- confirm the
        # intended literal before relying on the resulting file names.
        self._title = common.getTitle(url).replace(" -    ", "")

        baseUrl = "http://221.122.117.125/docin_{0}".format(self._pageID)
        # At most 99 parts are attempted; the loop normally ends earlier via
        # the size check below.
        for i in range(1, 100):
            filename = "{0}/{1}_{2}.docin".format(savePath, self._title, i)
            # The first part has no numeric suffix, later parts are "_<n>".
            if i == 1:
                partUrl = baseUrl + ".docin"
            else:
                partUrl = "{0}_{1}.docin".format(baseUrl, i)
            common.urlDownloadToFile(partUrl, filename)

            # Nothing was written at all: treat it as the end of the document
            # instead of letting os.path.getsize raise.
            if not os.path.isfile(filename):
                return
            # A response under 1024 bytes is taken to mean "no such part"
            # (presumably a short error page from the server), so discard it
            # and stop.
            if os.path.getsize(filename) < 1024:
                os.remove(filename)
                return
        
def main():
    """Download the sample document into the downloader's default directory."""
    sampleUrl = "http://www.docin.com/p-760258140.html&ccid=100003"
    docinDownloader().download(sampleUrl)


# Run the sample download only when executed as a script.
if __name__ == "__main__":
    main()
    

docinParse.py
"""
References (the original labels were partly lost in extraction):
        http://blog.csdn.net/lin379184514/article/details/5305061  # MFC version
        http://blog.csdn.net/kowity/article/details/6342925        # Python version
        http://blog.sina.com.cn/s/blog_6859df370100wsv2.html       # SWF format
        http://blog.csdn.net/jgood/article/details/4608546         # zlib
"""

import struct;
import zlib;
import common;

def getDocinDocInfo(url):
    """Return ``(docID, docName)`` for the docin page at *url*.

    The ID is taken from the URL itself; the name is taken from the page
    content, which is fetched over HTTP and decoded with the default codec.

    Raises:
        IndexError: if either pattern finds no match.
    """
    # Imported here because this module's import block does not provide urllib.
    import urllib.request

    # NOTE(review): reg_getDocinDocID and reg_getDocTitle are not defined in
    # the visible part of this module -- confirm where they are meant to come
    # from; as shown, calling this function raises NameError.
    docID = reg_getDocinDocID.findall(url)[0]
    # Context manager so the HTTP response is closed even if decoding fails.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode()
    docName = reg_getDocTitle.findall(html)[0]
    return docID, docName

class docinParse():
    """Split a ``.docin`` container into one uncompressed SWF file per page.

    Layout as read by this class: four 32-bit integers (width, height, page
    count, compressed header length), a zlib-compressed shared SWF header,
    then for every page a 32-bit length followed by that many bytes of
    zlib-compressed page body.
    """

    def __init__(self):
        # First four bytes of every emitted file: the "FWS" signature of an
        # uncompressed SWF followed by version byte 9.
        self._swfCommon = b"FWS" + bytes([9])

    def _getSwfInfo(self, docinFile):
        """Read the four leading 32-bit fields of *docinFile* into attributes.

        Sets ``_swfWidth``, ``_swfHeight``, ``_swfPages`` and
        ``_swfHeaderLength`` and leaves the stream positioned right after them.

        Raises:
            struct.error: if fewer than 16 bytes are available.
        """
        # Explicit little-endian so the result does not depend on the byte
        # order of the machine running the script.
        (self._swfWidth,
         self._swfHeight,
         self._swfPages,
         self._swfHeaderLength) = struct.unpack("<4i", docinFile.read(16))

    def parse(self, filename, startIndex = 1, savePath = "./tmpSwf"):
        """Write every page stored in *filename* to ``<savePath>/<n>.swf``.

        Args:
            filename: Path of the ``.docin`` file to split.
            startIndex: Number given to the first page written; later pages
                count up from it.
            savePath: Output directory (created through ``common.createDir``).

        Raises:
            struct.error: if the file is truncated inside a length field.
            zlib.error: if a compressed section cannot be decompressed.
        """
        common.createDir(savePath)
        # ``with`` closes the input on every exit path, including the early
        # return below and any exception.
        with open(filename, "rb") as docinFile:
            self._getSwfInfo(docinFile)
            swfHeader = zlib.decompress(docinFile.read(self._swfHeaderLength))

            for page in range(startIndex, startIndex + self._swfPages):
                byteBodyLen = docinFile.read(4)
                # The page count from the header can exceed the pages actually
                # stored in this file (presumably because a document is split
                # across several .docin parts), so end-of-file is a normal
                # stop condition rather than an error.
                if not byteBodyLen:
                    return
                bodyLen = struct.unpack("<i", byteBodyLen)[0]
                swfBody = zlib.decompress(docinFile.read(bodyLen))
                swf = swfHeader + swfBody

                # NOTE(review): the SWF specification defines the length field
                # as the size of the whole file including the 8-byte prefix;
                # the value written here is len(swf) only -- confirm whether
                # that is intentional before changing it.
                outName = "{0}/{1}.swf".format(savePath, page)
                with open(outName, "wb") as swfFile:
                    swfFile.write(self._swfCommon + struct.pack("<i", len(swf)) + swf)
            
        
        
def main():
    """Split the sample ``.docin`` file into SWF pages using the default options."""
    sampleFile = "docin_760258140.docin"
    docinParse().parse(sampleFile)


# Run the sample conversion only when executed as a script.
if __name__ == "__main__":
    main()

실행 과정의 스크린샷은 생략하겠습니다...

좋은 웹페이지 즐겨찾기