Python 리스트로 웹 페이지 표 데이터 처리하기

11228 단어 python
관련 지식
  • 정규 표현식 re
  • 리스트 처리
  • 코드는 다음과 같다.
    import requests
    import sys, io
    import re
    
    # Scrape a table from a web page into a list of rows.
    #
    # The page is downloaded, every table cell is pulled out with a regular
    # expression into one flat list, and the flat list is then cut into rows at
    # the positions of the serial-number cells "1", "2", ..., "N".

    url = "http://www.nifdc.org.cn/CL0903/11390.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Referer': '',
    }

    # Number of data rows (serial numbers 1..ROW_COUNT) expected in the table.
    ROW_COUNT = 150

    # NOTE(review): the published snippet shows r'(.+?)', whose surrounding HTML
    # tags were stripped when the page was rendered; that pattern would match
    # single characters. It is reconstructed here as a <td> cell matcher --
    # confirm against the actual page markup.
    CELL_PATTERN = re.compile(r'<td[^>]*>(.+?)</td>')


    def fetch_html(page_url=url, request_headers=headers, timeout=30):
        """Download *page_url* and return its body decoded as GB18030 text.

        Args:
            page_url: Address of the page to download.
            request_headers: HTTP headers sent with the request.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as ``str``.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status.
        """
        response = requests.get(url=page_url, headers=request_headers, timeout=timeout)
        # Fail here instead of trying to parse an error page further down.
        response.raise_for_status()
        return response.content.decode("gb18030")


    def extract_cells(html_doc):
        """Return the text of every cell matched by ``CELL_PATTERN``, in page order."""
        return CELL_PATTERN.findall(html_doc)


    def find_row_starts(cells, row_count=ROW_COUNT):
        """Locate the cell index at which each numbered row begins.

        Each serial number is searched for *after* the start of the previous
        row. Searching from the beginning of the list every time would pick up
        an earlier data cell that merely has the same text as a later serial
        number, producing out-of-order row starts.

        A data cell inside the immediately preceding row that equals the next
        serial number can still be mistaken for a row start; ruling that out
        needs the column count, which is not known here.

        Args:
            cells: Flat list of cell strings.
            row_count: Number of serial numbers (1..row_count) to locate.

        Returns:
            A strictly increasing list of ``row_count`` indexes into *cells*.

        Raises:
            ValueError: If a serial number cannot be found.
        """
        starts = []
        search_from = 0
        for serial in range(1, row_count + 1):
            try:
                position = cells.index(str(serial), search_from)
            except ValueError:
                raise ValueError(
                    "serial number %d not found at or after cell %d"
                    % (serial, search_from)
                ) from None
            starts.append(position)
            search_from = position + 1
        return starts


    def split_rows(cells, row_starts):
        """Cut *cells* into rows at *row_starts*.

        Args:
            cells: Flat list of cell strings.
            row_starts: Increasing indexes at which each data row begins.

        Returns:
            A list of lists: the title row (everything before the first start)
            followed by one list per entry in *row_starts*; the last row runs
            to the end of *cells*.
        """
        bounds = [0] + list(row_starts) + [len(cells)]
        return [cells[begin:end] for begin, end in zip(bounds, bounds[1:])]


    def main():
        """Fetch the page, split its table into rows, and print the result."""
        html_doc = fetch_html()

        # Emit UTF-8 on stdout regardless of the console's default encoding.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

        lst = extract_cells(html_doc)
        serial_no_lst = find_row_starts(lst)
        table_row_lst = split_rows(lst, serial_no_lst)

        print('*' * 120)
        print('table_row_lst:\n', table_row_lst)


    if __name__ == "__main__":
        main()

    좋은 웹페이지 즐겨찾기