Downloading a cnblogs blog - exporting a cnblogs blog - a cnblogs blog backup tool - based on Python

http://blog.csdn.net/infoworld/article/details/19547723
The code below is a cnblogs blog backup adapted from infoWorld's CSDN backup Python code (linked above). It no longer matches infoWorld's original interface, so it can only be run as a plain Python script. Python really is fun to work with: development is fast, and no wonder it is so popular.
#! encoding=utf-8

# cnblogs blog backup. Usage: set the url and output values at the bottom of this file, then run the script.

import urllib2
import re
import os
import sys
# from HTMLParser import HTMLParser
import html5lib
# from xml.etree.ElementTree import ElementTree
from urlparse import urlparse
import xml
import codecs
import traceback
import time

# class MyHTMLParser(HTMLParser):

#     def handle_starttag(self, tag, attrs):
#         # if tag.lower() == "img":
#             print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
#             for x in attrs:
#                 print "name %s,value %s" % (x[0],x[1])
#     def handle_endtag(self, tag):
#         print "Encountered the end of a %s tag" % tag

#     def handle_startendtag(self, tag, attrs):
#         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
#         for x in attrs:
#             print "name %s,value %s" % (x[0],x[1])

# number of retry attempts for each request
gTestTime = 5

def DownloadFile(url,output):
    responseText = None
    dirssPath = None
    try:
        res = urlparse(url)
        url = res.scheme+"://"+res.netloc+res.path
        path = res.path
        index = path.rfind('/')
        dirss = "/"
        if index != -1:
            dirss =  output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
            dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
            dirss_ansi = dirss.decode('utf-8')
            if not os.path.exists(dirss_ansi):
                os.makedirs(dirss_ansi)
        global gTestTime
        count = gTestTime    
        while True:
            if count < 0:
                break
            count = count - 1
            header={"User-Agent": "Mozilla-Firefox5.0"}
            if not url.startswith("http://"):
                break
            try:
                # print "url: %s:%d" % (url,count)
                time.sleep(0.5)
                request = urllib2.Request(url,None,header)
                response = urllib2.urlopen(request)
                dirssPath_ansi = dirssPath.decode("utf-8")
                if not os.path.exists(dirssPath_ansi):
                    resourceFile = open(dirssPath_ansi,"wb")
                    responseText = response.read()
                    if url.endswith(".js"):
                        responseText = responseText.replace("http://","")
                        responseText = responseText.replace("https://","")
                    resourceFile.write(responseText)
                    resourceFile.close()
                break         
            except Exception,e:
                print "DownloadFile: %s:%s:%d" % (e,url,count)
                # pass
                # exstr = traceback.format_exc()
                # print exstr

    except Exception,e:
            pass
            # exstr = traceback.format_exc()
            # print exstr
    
    return (responseText,url,output)
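# Hypothetical usage note (not part of the original script): DownloadFile mirrors a
# resource into output/<host>/<path>. For example,
#   DownloadFile("http://images.cnblogs.com/logo.gif", "f:/temp/GnagWang")
# would write f:/temp/GnagWang/images.cnblogs.com/logo.gif (if it does not exist yet)
# and return (downloaded bytes or None, the url stripped of query/fragment, output).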

def ReadCss(css):
    # print "ReadCss"
    mode = r'url\("?([^)"]+)"?\)'   # match url(...) references in the css; keep quotes out of the capture
    pattern = re.compile(mode)
    try:
        text = css[0]
        if css[0] == None:
            return
        strMatch = pattern.findall(text)
        size = len(strMatch)
        # print "size: ",size
        for i in range(0,size,1):
            one = strMatch[i]
            newurl = GetConcatUrl(css[1],one)
            DownloadFile(newurl,css[2])
    except Exception,e:
            pass
            # exstr = traceback.format_exc()
            # print exstr 
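# Hypothetical illustration (not in the original script): given css text such as
#   .icon { background: url("../images/f_icon.png") no-repeat; }
# pattern.findall(text) yields the inner reference '../images/f_icon.png', which is then
# resolved against the stylesheet's own url with GetConcatUrl() and fetched via DownloadFile().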

def Download(url,output):
    # try:
    header={"User-Agent": "Mozilla-Firefox5.0"}
    namespace = "{http://www.w3.org/1999/xhtml}"
    request = urllib2.Request(url,None,header)
    response = urllib2.urlopen(request)

    data = response.read()
    document = html5lib.parse(data)
    imgElements = document.findall('.//{0}img'.format(namespace))
    # print "imgElements %d" % len(imgElements)
    for img in imgElements:
        src = img.attrib["src"]
        # print "src %s" % src
        try:
            res = urlparse(src)
            # only download images hosted on cnblogs
            if not res.netloc.endswith(".cnblogs.com"):
                print "image not downloaded: %s:%s" % (src,res.netloc)
                continue
        except Exception,e:
            pass
        DownloadFile(src,output)

    linkElements = document.findall('.//{0}link'.format(namespace))
    # print "linkElements %d" % len(linkElements)
    for link in linkElements:
        href = link.attrib["href"]
        # print "href %s" % href
        text = DownloadFile(href,output)
        if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
            ReadCss(text)

    scriptElements = document.findall('.//{0}script'.format(namespace))
    # print "scriptElements %d" % len(scriptElements)
    for script in scriptElements:
        if script.attrib.has_key("src"):
            src = script.attrib["src"]
            # print "src %s" % src
            DownloadFile(src,output)
        
    htmlNameIndex = url.rfind("/");
    urlLen = len(url)
    htmlName = GetHtmlName(url)
    output = output.decode("utf-8") + "/"+htmlName+".htm"
    data = data.replace("http://","")
    data = data.replace("https://","")
    data = data.replace("www.w3.org/1999/xhtml","http://www.w3.org/1999/xhtml")

    resourceFile = open(output,"wb")
    resourceFile.write(data)
    resourceFile.close()

def GetConcatUrl(url,png):
    # one: "../images/f_icon.png" -- url http://static.csdn.net/public/common/toolbar/css/index.css
    count = 0
    index = png.find("..")
    startindex = None
    while index != -1:
        count = count + 1;
        startindex = index + 2
        index = png.find("..",startindex)

    second = png[startindex:]
    length = len(url)
    index = url.rfind("/")
    endindex = 0
    while count >= 0 and index != -1:
        endindex = index
        index = url.rfind("/",0, endindex)
        count = count - 1
    first = url[0:endindex]
    return first+second
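# Hypothetical example (not from the original post) of how GetConcatUrl resolves a
# relative "../" reference from a css file against the stylesheet's own URL:
#   GetConcatUrl("http://static.csdn.net/public/common/toolbar/css/index.css",
#                "../images/f_icon.png")
#   -> "http://static.csdn.net/public/common/toolbar/images/f_icon.png"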

def getAllListUrl(url):
    header={"User-Agent": "Mozilla-Firefox5.0"}
    request = urllib2.Request(url,None,header)
    response = urllib2.urlopen(request)
    data = response.read()
    
    # By default, the document will be an xml.etree element instance. Whenever possible, html5lib chooses the accelerated ElementTree implementation (i.e. xml.etree.cElementTree on Python 2.x).
    document = html5lib.parse(data)
    namespace = "{http://www.w3.org/1999/xhtml}"

    # locate the pager div at the bottom of the list page
    pageList = document.findall('.//{0}div[@id=\'homepage1_BottomPager\']'.format(namespace))
    alinks = list(pageList[0])              # children of the pager div
    alinks1 = list(alinks[0])               # the <a> page links inside it
    lastArticle = alinks1[len(alinks1)-1]
    # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
    lastArticleHref = lastArticle.attrib["href"]
    lastPageIndex = lastArticleHref.rfind("=")
    lastPageNum = int(lastArticleHref[lastPageIndex+1:])
    urlInfo = lastArticleHref[0:lastPageIndex]

    urlList = []
    for x in xrange(1,lastPageNum+1):
        listUrl = urlInfo+"="+str(x)
        urlList.append(listUrl)
    return urlList

def getArticleList(url):
    # collect the url and title of every article
    urlList = getAllListUrl(url)
    print "number of list pages: ", len(urlList)
    header={"User-Agent": "Mozilla-Firefox5.0"}

    allLists = []
    strPage = "parsing page {0}".decode("utf-8").encode("utf-8")
    pageNum = 0
    global gTestTime
    for one in urlList:
        tryCount = gTestTime    # retry count
        pageNum = pageNum + 1
        pageNumStr = strPage.format(pageNum)
        print pageNumStr
        while tryCount > 0:
            try:
                tryCount = tryCount - 1
                time.sleep(0.5)     # requesting too fast makes the server stop responding
                request = urllib2.Request(one,None,header)
                response = urllib2.urlopen(request)
                data = response.read()
                document = html5lib.parse(data,encoding="utf-8")
                namespace = "{http://www.w3.org/1999/xhtml}"
                # .//{0}div[@id=\'article_toplist\']
                #topLists = document.findall('.//{0}div[@id=\'article_toplist\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
                #articleLists = document.findall('.//{0}div[@id=\'article_list\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
                articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
                allLists = allLists + articleLists
                break
            except Exception, e:
                print "getArticleList %s:%s:%d" % (e,one,tryCount)

    count = 0   # number of articles
    artices = []
    for article in allLists:
        count = count+1
        alink = article.find(".//{0}a".format(namespace))
        # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
        href = alink.attrib["href"]
        #oneHref = "http://blog.csdn.net"+href
        oneHref = href
        childElement = list(alink)
        linkIter = alink.itertext()
        title = "".encode("utf-8")
        for x in linkIter:
            title = title+x.strip().encode("utf-8")
        artices.append([oneHref,title])
    return artices

def GetUserName(url):
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    htmlName = ""
    htmlNameIndex1 = url.rfind("/",0,htmlNameIndex)
    htmlName = url[htmlNameIndex1+1:htmlNameIndex]
    # if htmlNameIndex+1 == urlLen:
    #     htmlNameIndex = url.rfind("/",0,htmlNameIndex)
    #     htmlName = url[htmlNameIndex+1:urlLen-1]
    # else:
    #     htmlName = url[htmlNameIndex+1:]
    return htmlName

def GetHtmlName(url):
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    htmlName = ""
    if htmlNameIndex+1 == urlLen:
        htmlNameIndex = url.rfind("/",0,htmlNameIndex)
        htmlName = url[htmlNameIndex+1:urlLen-1]
    else:
        htmlName = url[htmlNameIndex+1:]
    return htmlName

# The url should look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page
# must contain a link to the last list page. For example, if GnagWang has 20 pages in total,
# a URL near the end such as the one above is recommended.
def Start(url,output):
    print "backup started"
    lists = getArticleList(url)
    username = GetUserName(url)
    output_username = output+"/"+username
    output_username = output_username.replace("\\","/")
    if not os.path.exists(output_username.decode("utf-8")):
        os.mkdir(output_username.decode("utf-8"))
    totalNum = len(lists)
    print "total number of articles: %d" % totalNum

    # create the index (first) page
    # (the HTML template strings did not survive in this post, hence the empty strings below)
    doctype = ''
    charset = ''
    indexHtml = output_username + ".htm"
    f = open(indexHtml.decode("utf-8"),"w")
    print >> f,doctype
    print >> f,''
    print >> f,''
    print >> f,charset
    print >> f,''
    print >> f,''
    navigationHtmlName = username+'-navigation.htm'
    print >> f,''
    firstHtmlName = GetHtmlName(lists[0][0])
    print >> f,''
    print >> f,''
    print >> f,''
    f.close()

    # create the navigation page
    navigationHtml = output+"/"+navigationHtmlName
    # f = open(navigationHtml.decode("utf-8"),"w")
    f = codecs.open(navigationHtml.decode("utf-8"),"w","utf-8-sig")
    print >> f,doctype
    print >> f,''
    print >> f,''
    print >> f,charset
    print >> f,' body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}'
    print >> f,''
    print >> f,''
    count = 0
    for x in lists:
        count = count + 1
        articleIdHtml = username+"/"+GetHtmlName(x[0])+".htm"
        print >> f,''+str(count)+'.'+x[1].decode("utf-8")+''
    print >> f,''
    print >> f,''
    f.close()

    print "navigation page generated"
    currentNum = 0
    strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
    global gTestTime
    for x in lists:
        count = gTestTime
        currentNum = currentNum+1
        while True:
            if count < 0:
                break
            count = count - 1
            try:
                time.sleep(1)   # slow down a little, otherwise the server may answer with 503
                strPageTemp = strPage.format(totalNum,currentNum)
                strPageTemp = strPageTemp+x[1]
                print strPageTemp   # the console may not be utf-8, so the title can look garbled
                print x[0]
                print "\n"
                Download(x[0],output_username)
                break
            except Exception, e:
                # exstr = traceback.format_exc()
                # print exstr
                pass

# The url should look like http://www.cnblogs.com/GnagWang/default.html?page=19 and contain a
# link to the last page; if GnagWang has 20 pages in total, use a URL like the one above.
if __name__=='__main__':
    url = "http://www.cnblogs.com/GnagWang/default.html?page=19"
    #output = "C:/Users/apple/Desktop/ "
    output = "f:/temp"
    Start(url,output)
    # Download("http://blog.csdn.net/dcraw/article/details/6858820",
    #          "C:/Users/apple/Desktop/ /infoworld")
