21.1.28

<Learning Web Crawling Through the Nomad Coders Web Scraping Course, Part 2>

  1. Tidying up indeed.py and main.py
import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit={LIMIT}"

def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")

  pagination = soup.find("div", {"class" : "pagination"})

  links = pagination.find_all('a')

  # The last link is the "Next" arrow, so drop it and collect the page numbers.
  pages = []
  for link in links[:-1]:
    pages.append(int(link.string))
  max_page = pages[-1]
  return max_page


def extract_job(html):
  title = html.find("h2", {"class" : "title"}).find("a")["title"]
  company = html.find("span", {"class" : "company"})
  # The company name is sometimes wrapped in an anchor and sometimes plain text.
  company_anchor = company.find('a')
  if company_anchor is not None:
    company = str(company_anchor.string)
  else:
    company = str(company.string)
  company = company.strip()
  location = html.find("span", {"class" : "location"}).string
  job_id = html["data-jk"]

  return {'title': title, 'company' : company, "location" : location, "link" : f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=python&limit=50&vjk={job_id}"}



def extract_jobs(last_page):
  jobs = []
  for page in range(last_page):
    result = requests.get(f"{URL}&start={page*LIMIT}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class" : "jobsearch-SerpJobCard"})
    for result in results:
      job = extract_job(result)
      jobs.append(job)

  return jobs


def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return jobs
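
Side note: Indeed's start query parameter is an offset into the results, not a page number, which is why extract_jobs multiplies the page index by LIMIT. A quick sketch of the URLs it generates (base URL abbreviated):

LIMIT = 50
# extract_jobs walks the results in steps of LIMIT:
for page in range(3):
  print(f"...?q=python&limit={LIMIT}&start={page * LIMIT}")
# ...?q=python&limit=50&start=0
# ...?q=python&limit=50&start=50
# ...?q=python&limit=50&start=100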

For now, pull in all of the data through indeed.py:


from indeed import get_jobs as get_indeed_jobs


indeed_jobs = get_indeed_jobs()
print(indeed_jobs)

main.py cleanup

  1. Starting to crawl the StackOverflow page (basic setup through extracting the page count, checking status_code)
import requests
from bs4 import BeautifulSoup


URL = "https://stackoverflow.com/jobs?q=python&sort=i"

def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class" : "s-pagination"}).find_all("a")
  # The last anchor is the "next" arrow, so the page count sits second from the end.
  last_page = pages[-2].get_text(strip = True)
  return int(last_page)


def extract_jobs(last_page):
  jobs = []
  for page in range(last_page):
    result = requests.get(f"{URL}&pg=page+1")
    print(result.status_code)

def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return
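
The status_code print is a sanity check that each page comes back 200. As a minimal sketch (my own addition, not from the course; fetch_pages is a hypothetical helper), the same loop could bail out on a failed response:

import requests

URL = "https://stackoverflow.com/jobs?q=python&sort=i"

def fetch_pages(last_page):
  pages = []
  for page in range(last_page):
    result = requests.get(f"{URL}&pg={page+1}")
    if result.status_code != 200:
      break  # stop paging once the site stops serving results
    pages.append(result.text)
  return pages
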
  1. Extracting jobs with BeautifulSoup (the job number first)
import requests
from bs4 import BeautifulSoup


URL = "https://stackoverflow.com/jobs?q=python&sort=i"

def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class" : "s-pagination"}).find_all("a")
  last_page = pages[-2].get_text(strip = True)
  return int(last_page)


def extract_jobs(last_page):
  jobs = []
  for page in range(last_page):
    result = requests.get(f"{URL}&pg=page+1")
    soup=BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class" : "-job"})
    for result in results:
      print(result["data-jobid"])

def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return

Pull out each job's number (data-jobid) first.

Also, don't get locked into the idea that find_all has to grab only the top-level div; you can pull directly from the element that holds each job, as above.
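
To make that concrete: find_all searches the whole subtree, so the job cards can be targeted directly by class without grabbing the container div first. A minimal sketch with toy markup (illustrative only, not StackOverflow's real HTML):

from bs4 import BeautifulSoup

html = """
<div class="listResults">
  <div class="-job" data-jobid="1"><h2>Job A</h2></div>
  <div class="-job" data-jobid="2"><h2>Job B</h2></div>
</div>
"""
soup = BeautifulSoup(html, "html.parser")

# find_all matches at any depth, so there is no need to grab the outer div first.
for card in soup.find_all("div", {"class": "-job"}):
  print(card["data-jobid"])  # prints 1, then 2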

  1. Extracting the job title
import requests
from bs4 import BeautifulSoup


URL = "https://stackoverflow.com/jobs?q=python&sort=i"

def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class" : "s-pagination"}).find_all("a")
  last_page = pages[-2].get_text(strip = True)
  return int(last_page)

def extract_job(html):
  title = html.find("h2", {"class" : "mb4"}).find('a')["title"]
  return {"title" : title}



def extract_jobs(last_page):
  jobs = []
  for page in range(last_page):
    result = requests.get(f"{URL}&pg=page+1")
    soup=BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class" : "-job"})
    for result in results:
      job = extract_job(result)
      jobs.append(job)
  return jobs

def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return

  1. Crawling the company name + location
    The company name and the location sit together inside span tags. -> Use recursive = False so find_all doesn't descend into nested tags, then split the two elements of the resulting list into separate variables.
(No changes to the other parts.)
def extract_job(html):
  title = html.find("h2", {"class" : "mb4"}).find('a')["title"]
  company, location = html.find("h3", {"class" : "mb4"}).find_all("span", recursive = False)
  print(company.get_text(strip = True), location.get_text(strip = True).strip('-'))

  return {"title" : title}
  1. Code cleanup / getting the apply link (wrap-up)
import requests
from bs4 import BeautifulSoup


URL = "https://stackoverflow.com/jobs?q=python&sort=i"

def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class" : "s-pagination"}).find_all("a")
  last_page = pages[-2].get_text(strip = True)
  return int(last_page)

def extract_job(html):
  title = html.find("h2", {"class" : "mb4"}).find('a')["title"]
  company, location = html.find("h3", {"class" : "mb4"}).find_all("span", recursive = False)
  company = company.get_text(strip = True)
  location = location.get_text(strip = True).strip('-')
  job_id = html["data-jobid"]

  return {"title" : title, 'company' : company, 'location' : location, 'apply_link' : f"https://stackoverflow.com/jobs/{job_id}"}



def extract_jobs(last_page):
  jobs = []
  for page in range(last_page):
    result = requests.get(f"{URL}&pg=page+1")
    soup=BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class" : "-job"})
    for result in results:
      job = extract_job(result)
      jobs.append(job)
  return jobs

def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return jobs

main.py

from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs

indeed_jobs = get_indeed_jobs()
so_jobs = get_so_jobs()
jobs = so_jobs + indeed_jobs
print(jobs)

Watch out for small typos.
New things learned: how to use get_text, and the usefulness of recursive
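
On get_text: it differs from .string in a way that mattered here. .string returns None as soon as a tag has more than one child (which is why the Indeed extract_job checks for an anchor first), while get_text concatenates all descendant text, and strip = True trims the whitespace. A tiny sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<span>Acme <a>Corp</a></span>', "html.parser")
span = soup.find("span")
print(span.string)                # None: .string gives up once a tag has multiple children
print(span.get_text(strip=True))  # AcmeCorp: all descendant text, whitespace stripped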

Result: Repl.it
