python 중국어 문자 처리 경험

3216 단어 python
#!/usr/bin/env python
#-*- coding:utf-8 -*-

import hashlib
import md5
import os
import sys

# Directory the source files are compared against.
# NOTE(review): the directory names below consist only of spaces; presumably
# non-ASCII characters were lost somewhere -- confirm the real paths before use.
destPath = r'h:\  A\  '
# Directory whose files are looked up (by MD5 digest) in destPath.
srcPath = r'h:\  B\  '
# Text file that receives the paths of files found only under srcPath.
rstPath = r'h:\  C\rst.txt'

#----------------------------------------------------------------------
def find_all_files(path):
    '''
    Recursively collect the paths of all regular files below ``path``.

    One progress line is printed per directory entry, giving its path and
    kind ("file", "dir" or "parse error!").

    :param path: directory to scan. A byte string is decoded as UTF-8;
        a unicode string is used unchanged.
    :return: list of file paths in ``os.listdir`` order; the files of a
        sub-directory appear at the position where that sub-directory
        was listed.
    :raises OSError: if ``path`` or one of its sub-directories cannot be
        listed.
    '''
    print('\r\r')
    # Decode explicitly, and only when needed: recursive calls pass paths that
    # are already unicode, and decoding those again would depend on the
    # interpreter's default encoding.
    if isinstance(path, bytes):
        path = path.decode('utf8')
    fileslist = []
    for name in os.listdir(path):
        # os.path.join picks the platform separator instead of a fixed '\\'.
        entry = os.path.join(path, name)
        if os.path.isfile(entry):
            fileslist.append(entry)
            print('%s file' % entry)
        elif os.path.isdir(entry):
            print('%s dir' % entry)
            fileslist += find_all_files(entry)
        else:
            # Neither a regular file nor a directory (e.g. a broken link).
            print('%s parse error! \t%s' % (entry, entry))
    return fileslist

#----------------------------------------------------------------------
def md5_list(path):
    '''
    Map the MD5 digest of every file below ``path`` to that file's path.

    Files that cannot be opened or read are reported on stdout and skipped.

    :param path: directory to scan; passed straight to ``find_all_files``.
    :return: dict of hex MD5 digest -> file path. Files with identical
        content share a digest, so only the last such file seen is kept.
    '''
    filesMd5 = {}
    for filePath in find_all_files(path):
        digest = hashlib.md5()
        try:
            # ``with`` closes the handle on every exit path. The previous
            # handler called fp.close() even when open() itself had failed,
            # i.e. on an unbound or stale handle.
            with open(filePath, 'rb') as fp:
                # Hash in fixed-size chunks so large files are never loaded
                # into memory at once.
                for chunk in iter(lambda: fp.read(8096), b''):
                    digest.update(chunk)
        except (IOError, OSError, UnicodeError) as ex:
            # Best effort: report the unreadable file and carry on.
            print(ex)
            continue
        filesMd5[digest.hexdigest()] = filePath

    return filesMd5

if __name__=='__main__':
    # Python 2 only: switch the interpreter's default codec from ASCII to
    # UTF-8 so that any implicit str <-> unicode conversion in this script
    # does not fail on non-ASCII file names.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    print('Begin.......')

    srcFilesMd5 = md5_list(srcPath)
    destFilesMd5 = md5_list(destPath)

    # Paths of files whose content (by MD5 digest) exists under srcPath but
    # not under destPath. Testing against the dict itself is a hash lookup;
    # ``in destFilesMd5.keys()`` scanned a list for every key.
    missing = [srcFilesMd5[key].encode('utf8')
               for key in srcFilesMd5
               if key not in destFilesMd5]

    # Every path is terminated by '\r', as before; join avoids rebuilding the
    # whole result string once per file.
    rst = ''.join(fileName + '\r' for fileName in missing)

    with open(rstPath, 'w') as fp:
        fp.write(rst)

    # The literal was previously split across two source lines, which is a
    # syntax error; the line break is now written as an escape.
    print('\nRun Over......')
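The script above compares the files of two directories by their MD5 digests, and writes the paths of the files that exist only in the source directory to rstPath.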
                               。          ,          ,      decode() encode()   。         ,         。
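By default, Python (2.x) uses the ASCII codec, and the plain string type (str) holds bytes; ASCII cannot represent Chinese characters. Python also supports unicode: prefixing a string literal with 'u' makes it a unicode string, e.g. u'hello' is of type unicode.
When a string contains characters outside the ASCII range, errors occur, so it has to be converted to a unicode string. The relevant functions are: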
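decode() converts a string in some other encoding to unicode; e.g. str1.decode('gb2312') converts the gb2312-encoded string str1 to unicode;
encode() converts a unicode string to a string in some other encoding; e.g. str2.encode('gb2312') converts the unicode string str2 to gb2312;
unicode() works like decode(): it converts a string in some other encoding to unicode; e.g. unicode(str3, 'gb2312') converts the gb2312-encoded string str3 to unicode.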
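So, to convert a str from one encoding to another, first decode it to unicode and then encode it to the target encoding.
Note that decoding a string that is already unicode can fail, so check whether the value is already unicode before decoding, e.g. with isinstance(str, unicode).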
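When handling Chinese text, to avoid ASCII codec errors, do the following:
1. Declare the encoding of the source file, e.g. utf8;
2. Convert strings to unicode with unicode() or decode(), e.g. str1.decode('utf8') or unicode(str1, 'utf8');
3. Convert to the required encoding with encode() on output.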

좋은 웹페이지 즐겨찾기