21.1.28
<Learning Web Crawling Through the Nomad Coders Web Scraping Course, Part 2>
- Cleaning up indeed.py and main.py
import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit={LIMIT}"


def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:  # the last link is the "next" arrow, not a page number
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page


def extract_job(html):
    title = html.find("h2", {"class": "title"}).find("a")["title"]
    company = html.find("span", {"class": "company"})
    company_anchor = company.find('a')
    # some company names are wrapped in an <a>, others are plain text
    if company_anchor is not None:
        company = str(company_anchor.string)
    else:
        company = str(company.string)
    company = company.strip()
    location = html.find("span", {"class": "location"}).string
    job_id = html["data-jk"]
    return {
        'title': title,
        'company': company,
        "location": location,
        "link": f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit=50&vjk={job_id}",
    }


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page * LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)
    return jobs


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
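Side note on the if/else around company_anchor: on the Indeed cards some company names are wrapped in an <a> tag and others are plain text inside the span, which is why extract_job tries the anchor first and falls back to the span's own string. A tiny demo with made-up markup (the HTML here is hypothetical, just to show both cases):

from bs4 import BeautifulSoup

html = """
<span class="company"><a>WrappedCorp</a></span>
<span class="company">  PlainCorp  </span>
"""
soup = BeautifulSoup(html, "html.parser")
for span in soup.find_all("span", {"class": "company"}):
    anchor = span.find('a')
    # same fallback as extract_job: prefer the anchor's text when it exists
    name = str(anchor.string) if anchor is not None else str(span.string)
    print(name.strip())  # WrappedCorp, then PlainCorp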
For now, pull all of the data in through indeed.py:
from indeed import get_jobs as get_indeed_jobs
indeed_jobs = get_indeed_jobs()
print(indeed_jobs)
That wraps up the main.py cleanup.
- Starting to crawl the StackOverflow jobs page (basic setup through extracting the page count, checking status_code)
import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python&sort=i"


def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    # the very last link is "next", so the page count sits at index -2
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&pg={page + 1}")  # pages are 1-indexed
        print(result.status_code)


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return
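Printing status_code is just a quick sanity check that each request comes back as 200. If I wanted the scraper to stop on a bad page instead of silently parsing an error response, requests has a built-in for that; a minimal sketch:

import requests

result = requests.get("https://stackoverflow.com/jobs?q=python&sort=i&pg=1")
result.raise_for_status()  # raises requests.exceptions.HTTPError on any 4xx/5xx response
print(result.status_code)  # only reached when the request succeeded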
- Pulling out the jobs with BeautifulSoup (job numbers first)
import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python&sort=i"


def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&pg={page + 1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-job"})
        for result in results:
            # each job card carries its id in the data-jobid attribute
            print(result["data-jobid"])


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return
Extracted the job numbers first.
Also, don't get locked into the idea that find_all only pulls the top-level div: you can run the same search from the element that holds the job itself, as above (quick sketch below).
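In other words, find_all isn't limited to the top-level soup; every tag you've already found exposes the same search methods, so you can scope the search to one job card. A quick sketch with made-up markup:

from bs4 import BeautifulSoup

html = '<div class="-job" data-jobid="123"><h2><a title="Backend Dev">job</a></h2></div>'
soup = BeautifulSoup(html, "html.parser")
card = soup.find("div", {"class": "-job"})
print(card["data-jobid"])       # 123 -> attribute access on the card itself
print(card.find("a")["title"])  # Backend Dev -> searching inside the card, not the whole page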
- Pulling out the job titles
import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python&sort=i"


def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)


def extract_job(html):
    title = html.find("h2", {"class": "mb4"}).find('a')["title"]
    return {"title": title}


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&pg={page + 1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-job"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)
    return jobs


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
- Crawling the company name + location
The company name and the location sit in spans inside the same h3 -> pass recursive=False so find_all doesn't descend into nested tags, then unpack the two resulting list elements into separate variables (standalone demo after the snippet).
(everything else unchanged)
def extract_job(html):
    title = html.find("h2", {"class": "mb4"}).find('a')["title"]
    # recursive=False stays at the direct children, so exactly two spans come back
    company, location = html.find("h3", {"class": "mb4"}).find_all("span", recursive=False)
    print(company.get_text(strip=True).strip('-'), location.get_text(strip=True))
    return {"title": title}
- Code cleanup / grabbing the apply link (wrap-up)
import requests
from bs4 import BeautifulSoup

URL = "https://stackoverflow.com/jobs?q=python&sort=i"


def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)


def extract_job(html):
    title = html.find("h2", {"class": "mb4"}).find('a')["title"]
    company, location = html.find("h3", {"class": "mb4"}).find_all("span", recursive=False)
    company = company.get_text(strip=True)
    location = location.get_text(strip=True).strip('-')
    job_id = html["data-jobid"]
    return {
        "title": title,
        "company": company,
        "location": location,
        "apply_link": f"https://stackoverflow.com/jobs/{job_id}",
    }


def extract_jobs(last_page):
    jobs = []
    for page in range(last_page):
        result = requests.get(f"{URL}&pg={page + 1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-job"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)
    return jobs


def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
main.py
from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
indeed_jobs = get_indeed_jobs()
so_jobs = get_so_jobs()
jobs = so_jobs + indeed_jobs
print(jobs)
Watch out for small typos.
New things learned: how to use get_text, and how useful recursive is (demo below).
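For the record, here's what get_text actually does on a small made-up tag: it gathers all text inside a tag and its descendants, strip=True trims each piece, and an optional separator controls the join:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<h3> <span> StackCorp </span> <span> - Seoul </span> </h3>", "html.parser")
h3 = soup.find("h3")
print(repr(h3.get_text()))                 # all text, whitespace intact
print(repr(h3.get_text(strip=True)))       # 'StackCorp- Seoul' -> each piece stripped, then joined
print(repr(h3.get_text(" ", strip=True)))  # 'StackCorp - Seoul' -> joined with a space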