[Crawling] BeautifulSoup Crawling

코리안넷 페이지 크롤링

바코드(88코드)에 따른 상품정보 크롤링

import requests
from bs4 import BeautifulSoup

def crawl_by_bs4(url, timeout=10):
    """Scrape product info for a barcode (88-code) from the Koreannet page.

    Parameters
    ----------
    url : str
        Barcode/item code appended to the Koreannet product URL.
    timeout : float, optional
        Seconds to wait for the HTTP response (prevents the script
        from hanging forever on a stalled connection).

    Returns
    -------
    list
        [code1, code2, category levels...] on success,
        ['-', '-', '-', '-', '-', '-'] when the barcode is not registered
        (or the page layout is missing the expected elements),
        [] on a non-200 HTTP response (status code is printed).
    """
    response = requests.get('http://gs1.koreannet.or.kr/pr/' + url,
                            timeout=timeout)
    gbnlist = []
    if response.status_code != 200:
        print(response.status_code)
        return gbnlist

    soup = BeautifulSoup(response.text, 'html.parser')
    gbn = soup.select_one('body > div > div.pv_title > table > tbody > tr:nth-child(2) > td')
    code1_tag = soup.select_one('body > div > div.pv_title > table > tbody > tr:nth-child(5) > td')
    code2_tag = soup.select_one('body > div > div.pv_title > h3')

    # Guard against missing elements: the original called .text on the
    # result of select_one and crashed with AttributeError when the page
    # did not match the expected layout.
    if gbn is None or code1_tag is None or code2_tag is None:
        return ['-', '-', '-', '-', '-', '-']

    gbnlist = gbn.get_text().split('>')
    # Barcode exists but has no registered category hierarchy.
    if gbnlist == ['']:
        gbnlist = ['-', '-', '-', '-', '-', '-']
    else:
        gbnlist.insert(0, code2_tag.text)
        gbnlist.insert(0, code1_tag.text)
    return gbnlist

크롤링한 정보 저장

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
# 'ansi' is a codec alias that exists only on Windows; on any other
# platform pd.read_csv raises LookupError. On a Korean-Windows machine
# "ANSI" resolves to cp949, so name that codec explicitly for portability.
csvfile = pd.read_csv('./hmr.csv', encoding='cp949')

def Refiner(X):
    """Build one output row per product by joining crawled data to the CSV row.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw CSV contents; must have exactly 19 columns (renamed below).

    Returns
    -------
    list of list
        For each row: [item code, *crawled fields, *remaining CSV columns].
    """
    result = []
    data = X.copy()
    data.columns = ['상품코드', '상품명', 'CD_GBN', '계층1', '계층2', '계층3', '계층4', '계층5', '계층코드', '계층명1', '계층명2', '계층명3', '계층명4', '계층명5', '제조사코드', '제조사명', '수정일자', '생성일자', '중량']

    for i in range(len(data)):
        row = data.iloc[i]
        # str() replaces the original np.string_(...).decode('UTF-8')
        # round-trip: np.string_ was removed in NumPy 2.0, and the
        # bytes-encode/decode pair was an identity conversion anyway.
        itemcode = str(row['상품코드'])
        hlist = crawl_by_bs4(itemcode)
        hlist.insert(0, itemcode)
        # Append every CSV column after the item code. Explicit positional
        # slicing replaces the deprecated row[j] integer chained indexing.
        hlist.extend(row.iloc[1:].tolist())
        result.append(hlist)
    return result

# Crawl/refine every row of the source CSV and persist the merged rows.
result = Refiner(csvfile)
df = pd.DataFrame(result)
# NOTE(review): absolute Windows path and no index=False — the row index is
# written as an extra first column; confirm that is intended downstream.
df.to_csv('C:/Users/User/PycharmProjects/pypy/HMR바코드계층.csv',sep=',',encoding = 'UTF-8')

좋은 웹페이지 즐겨찾기