Crawling Blog Data on Senior Citizen Protection Zones

To gauge public perception of senior citizen protection zones (노인보호구역), I crawled Naver blog posts.

  1. Load the required packages and font.
import urllib.request
import urllib.parse  # needed for urllib.parse.quote() below
import json
import re
from konlpy.tag import Okt
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import pandas as pd
  2. Use the Naver Search API to crawl blog posts (the API returns results up to the 1,000th item: 10 requests of 100 each).
    Store the collected data and write it out as a CSV file.
def get_blog():
    keyword = input('검색어를 입력하세요:')
    client_id = ""
    client_secret = ""

    encText = urllib.parse.quote(keyword)

    tlist = []
    llist = []
    dlist = []



    for pagenum in range(1,1000,100):

        try:

            url = "https://openapi.naver.com/v1/search/blog?query=" + encText +"&display=100&sort=sim&start="+str(pagenum)
            # url = "https://openapi.naver.com/v1/search/blog.xml?query=" + encText # xml 결과
            request = urllib.request.Request(url)
            request.add_header("X-Naver-Client-Id",client_id)
            request.add_header("X-Naver-Client-Secret",client_secret)
            response = urllib.request.urlopen(request)
            rescode = response.getcode()
            if rescode == 200:
                response_body = response.read()
                print(response_body.decode('utf-8'))
                jtemp = response_body.decode('utf-8')
                jdata = json.loads(jtemp)
                
                for temp in jdata['items']:  
                    hangul = re.compile('[^ ㄱ-ㅎ가-힣]+')  # keep only spaces and Hangul (a '|' inside the class would match a literal pipe)
                    tdata = temp['title']
                    ldata = temp['link']
                    ddata = hangul.sub(r'',temp['description'])

                    tlist.append(tdata)
                    llist.append(ldata)
                    dlist.append(ddata)
                

            else:
                print("Error Code: " + str(rescode))

        except Exception as e:
            print('Error:', e)



    result = []
    for t, l, d in zip(tlist, llist, dlist):
        result.append([t, l, d])


    f = open('{0} - 네이버API 블로그검색.csv'.format(keyword), 'w', encoding='utf-8')
    f.write('제목,링크,내용\n')
    for temp in result:
        f.write(','.join(temp) + '\n')
    f.close()
    
    return result
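Note that titles and descriptions returned by the API can contain commas, quotes, or <b> tags, which would corrupt the hand-written CSV above. Below is a sketch of the same export using the standard csv module, which quotes such fields safely (the save_csv helper is an assumption, not part of the original code):

import csv

def save_csv(result, keyword):
    # csv.writer automatically quotes fields containing commas or quotes
    with open('{0} - 네이버API 블로그검색.csv'.format(keyword), 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['제목', '링크', '내용'])
        writer.writerows(result)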
  3. A function that removes every character except Hangul
def clean_str(s):
    hangul = re.compile('[^ㄱ-ㅎ가-힣]+')  # keep Hangul only (a '|' inside the class would match a literal pipe)
    s = hangul.sub(r' ', s)

    # Remove supplementary-plane characters such as emoji
    cp = re.compile("["
                     u"\U00010000-\U0010FFFF"
                     "]+", flags=re.UNICODE)
    s = cp.sub(r' ', s)

    return s.strip()
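For example (sample string made up for illustration):

print(clean_str('노인보호구역 안전! Safety first 123'))  # -> '노인보호구역 안전'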
  4. Extract only the description field from the crawled data and concatenate it
def get_text(data):
    result_text = ''
    for temp in data:
        result_text = result_text +' ' +  temp[2]

    return result_text
  5. A function for drawing the word cloud
def Wordcloud(data, savename, maskname=''):
    # data is a {word: count} dict from get_tags(), so build the cloud
    # from the frequencies directly instead of re-counting a concatenated string
    if maskname == '':
        wc = WordCloud(font_path='../data/WordCloud/font/BMEULJIROTTF.ttf', background_color='white', max_font_size=60, colormap='PuOr_r')
    else:
        maskimg = np.array(Image.open(maskname))
        wc = WordCloud(font_path='../data/WordCloud/font/BMEULJIROTTF.ttf', background_color='white', mask=maskimg, max_font_size=60, colormap='PuOr_r')

    wc.generate_from_frequencies(data)
    plt.figure(figsize=(20,10))
    plt.imshow(wc)
    plt.tight_layout(pad=0)
    plt.axis('off')
    plt.show()
    wc.to_file('../data/크롤링/' + savename + '.png')  # save as <savename>.png, not the literal string 'savename'
  6. Run the crawl
bdata = get_blog()
rtext = get_text(bdata)
  7. Add the negative-word dictionary
n = pd.read_csv('../data/크롤링/사전/neg_pol_word.csv', sep='\n')  # one expression per line
nag = n['0'].tolist()
  8. Add the positive-word dictionary
p = pd.read_csv('../data/크롤링/사전/pos_pol_word.csv', sep='\n')
pos = p['0'].tolist()
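The negative list nag is loaded here but never used in the word-cloud steps below. As a minimal sketch of what the two dictionaries make possible, e.g. scoring a crawled description (the score_text helper is hypothetical, not part of the original pipeline):

def score_text(text, pos_words, neg_words):
    # Naive lexicon matching: +1 per positive entry found, -1 per negative entry
    score = sum(1 for w in pos_words if w in text)
    score -= sum(1 for w in neg_words if w in text)
    return score

# e.g. score_text(bdata[0][2], pos, nag) scores the first crawled description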
  9. Build a stopword list and add it to the dictionary (get_tags below filters on the combined list)
stopwords = ['실버존','스쿨존','보호','구역','노인','어린이','등','교통','곳','지정','안전','사업','일','시설','위해','및','년','장애인','위','경로당','설치','주변',
            '교통사고','보행자','보행','환경','도로','시','유치원','이번','중','조례','복지','차량','올해','애인','개','개소','억','관내',
            '의원','전국','발생','추가','이','물','월','원','확대','내','현재','광주','리','지역','최근','대전','것','지난해','초등학교','관','인구',
            '존','완료','대한','투입','공단','기자','대해','지난','마을','로','수','총','시행','조성','추진','공원','경찰정','경찰','충남',
            '회','또','군','신규','계획','안','광주시','시스템','생활','활동','중구','어르신','윤','실버','서울시회관','경우','통행','기준','어린이집',
            '진행','구','경기도','만','윤','복지관','아산시','대책','기관','인근','행사','점검','부과','지원','대폭','도시','억원','횡단보도','전통','도',
            '서울시','시장','회관','운영','가운데','민주당','공사','제조','요양원','개정','이상','규칙','고','지자체','앞','억만원','운전','출처',
            '학교','일자리','만원','도로교통법','미끄럼','주간','주민','금','일반','표시','센터']


pos.extend(stopwords)  # get_tags() filters out anything found in pos
  10. Split the text into word tags (noun extraction and counting)
def get_tags(text, ntags=50):
    splitter = Okt()
    nouns = splitter.nouns(text)
    count = Counter(nouns)
    words = dict(count.most_common(ntags))

    # Delete a word if it appears in the stopword/positive dictionary
    for i in pos:
        if i in words:
            del words[i]
    return words
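A quick illustrative check (sample sentence made up; the exact nouns Okt extracts may differ):

sample = get_tags('노인보호구역 표지판과 과속 단속 카메라', ntags=10)
print(sample)  # e.g. {'표지판': 1, '과속': 1, '단속': 1, '카메라': 1} after filtering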

  11. Limit the tags to 200
rtags = get_tags(rtext, ntags=200)
  12. Run the word cloud
Wordcloud(rtags, '노인보호구역','../data/WordCloud/mask/mask1.jpeg')


Sources

Naver API (https://developers.naver.com/)
Korean sentiment dictionary, KnuSentiLex (https://github.com/park1200656/KnuSentiLex)
Mask image (https://www.google.com/imgres?imgurl=https://thumb.silhouette-ac.com/t/fb/fbd36cb62b2a60bf4e6f81ebb957d6a5_t.jpeg&imgrefurl=https://ac-illust.com/ko/clip-art/91341/%25EA%25B1%25B7%25EB%258A%2594-%25EC%2582%25AC%25EB%259E%258C%25EB%2593%25A4&h=340&w=340&tbnid=eBfIMkikIdDYUM&tbnh=225&tbnw=225&usg=AI4_-kSfCvv5LXvOkE4Ei5iXsr2MhxMvqA&vet=1&docid=doC7wTIPoixcYM&itg=1&hl=ko)
