웹 크롤러--25.【selenium 실전】라거우(Lagou) 채용 사이트 크롤러 --selenium으로 데이터 가져오기
17861 단어 웹 크롤러
코드 구현
#encoding: utf-8
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class LagouSpider(object):
    """Selenium-driven crawler for the Python job listing on lagou.com.

    ``run`` walks the paginated search-result list. Every posting link found
    on a result page is opened in a temporary browser tab, parsed with lxml,
    and the extracted fields are appended to ``self.positions`` as a dict.
    """

    # Location of the chromedriver binary handed to ``webdriver.Chrome``.
    driver_path = r"D:\Program Files\chromedriver_win32\chromedriver.exe"
    # Last <span> inside the pager; the crawler treats it as the "next page"
    # button and stops once its class contains "pager_next_disabled".
    next_btn_xpath = "//div[@class='pager_container']/span[last()]"
    # How often a failing "next page" click is retried before giving up.
    max_next_page_attempts = 3

    def __init__(self):
        """Start a Chrome session and initialise the crawl state."""
        # NOTE(review): ``executable_path`` is kept exactly as the original
        # passed it; newer Selenium releases reportedly expect a ``Service``
        # object instead -- confirm against the installed version.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        # One dict per scraped posting (see ``parse_detail_page``).
        self.positions = []

    def run(self):
        """Crawl result pages until the pager's next button is disabled.

        Scraped postings accumulate in ``self.positions``. A timeout while
        waiting for the pager propagates to the caller.
        """
        self.driver.get(self.url)
        while True:
            # Wait for the pager *before* reading the page source, so the
            # snapshot is not taken while the page is still being rendered.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, self.next_btn_xpath))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            if not self._go_to_next_page(source):
                break
            # Give the next result page a moment to load.
            time.sleep(1)

    def _go_to_next_page(self, source):
        """Click the pager's next button.

        Returns True when the click succeeded, and False when the button is
        disabled (last page) or the click kept failing. ``source`` is the
        current page's HTML; it is printed for diagnosis when all attempts
        fail.
        """
        # Function-scope import: keeps this block independent of the file's
        # top-level import list.
        from selenium.common.exceptions import WebDriverException

        for attempt in range(1, self.max_next_page_attempts + 1):
            try:
                next_btn = self.driver.find_element(By.XPATH, self.next_btn_xpath)
                # get_attribute may return None when the attribute is absent.
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    return False
                next_btn.click()
                return True
            except WebDriverException as exc:
                # Only WebDriver failures are retried; KeyboardInterrupt and
                # programming errors are no longer swallowed.
                print("next page click failed (attempt %d/%d): %s"
                      % (attempt, self.max_next_page_attempts, exc))
                time.sleep(1)
        # Retrying only the click (not the whole page) avoids scraping the
        # same postings again; after the last attempt dump the page and stop.
        print(source)
        return False

    def parse_list_page(self, source):
        """Visit every posting linked from one result-list page.

        ``source`` is the HTML of the list page.
        """
        html = etree.HTML(source)
        for link in html.xpath("//a[@class='position_link']/@href"):
            self.request_detail_page(link)
            # Pause between detail pages.
            time.sleep(1)

    def request_detail_page(self, url):
        """Open ``url`` in a new tab, parse it, then return to the list tab.

        The detail tab is closed and focus is restored to the list tab even
        when waiting for or parsing the page raises; the exception itself
        still propagates.
        """
        list_handle = self.driver.current_window_handle
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        try:
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='job-name']/span[@class='name']")
                )
            )
            self.parse_detail_page(self.driver.page_source)
        finally:
            # Close the detail tab and hand control back to the list tab.
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    @staticmethod
    def _first_text(nodes, index=0):
        """Return the stripped first text under ``nodes[index]``, or ''."""
        if index >= len(nodes):
            return ""
        texts = nodes[index].xpath(".//text()")
        return texts[0].strip() if texts else ""

    @staticmethod
    def _strip_separators(text):
        """Remove whitespace and '/' characters from ``text``."""
        return re.sub(r"[\s/]", "", text)

    def parse_detail_page(self, source):
        """Extract one posting from a detail page and store it.

        ``source`` is the HTML of the detail page. The resulting dict has the
        keys ``name``, ``company_name``, ``salary``, ``city``,
        ``work_years``, ``education`` and ``desc``; it is appended to
        ``self.positions`` and printed. Fields missing from the page become
        empty strings instead of raising ``IndexError``.
        """
        html = etree.HTML(source)

        names = html.xpath("//span[@class='name']/text()")
        position_name = names[0] if names else ""

        # The spans are read positionally: salary, city, years, education.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = self._first_text(job_request_spans, 0)
        city = self._strip_separators(self._first_text(job_request_spans, 1))
        work_years = self._strip_separators(self._first_text(job_request_spans, 2))
        education = self._strip_separators(self._first_text(job_request_spans, 3))

        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()

        company_names = html.xpath("//h2[@class='fl']/text()")
        company_name = company_names[0].strip() if company_names else ""

        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc
        }
        self.positions.append(position)
        print(position)
        print('='*40)
if __name__ == '__main__':
    # Script entry point: crawl every result page, then shut the browser down.
    spider = LagouSpider()
    try:
        spider.run()
    finally:
        # End the WebDriver session even when the crawl raises, so no
        # browser / chromedriver process is left behind.
        spider.driver.quit()
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
웹 크롤러--25.【selenium 실전】라거우(Lagou) 채용 사이트 크롤러 --selenium으로 데이터 가져오기
텍스트를 자유롭게 공유하거나 복사할 수 있습니다. 하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.
#encoding: utf-8
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
class LagouSpider(object):
    """Selenium-driven crawler for the Python job listing on lagou.com.

    ``run`` walks the paginated search-result list. Every posting link found
    on a result page is opened in a temporary browser tab, parsed with lxml,
    and the extracted fields are appended to ``self.positions`` as a dict.
    """

    # Location of the chromedriver binary handed to ``webdriver.Chrome``.
    driver_path = r"D:\Program Files\chromedriver_win32\chromedriver.exe"
    # Last <span> inside the pager; the crawler treats it as the "next page"
    # button and stops once its class contains "pager_next_disabled".
    next_btn_xpath = "//div[@class='pager_container']/span[last()]"
    # How often a failing "next page" click is retried before giving up.
    max_next_page_attempts = 3

    def __init__(self):
        """Start a Chrome session and initialise the crawl state."""
        # NOTE(review): ``executable_path`` is kept exactly as the original
        # passed it; newer Selenium releases reportedly expect a ``Service``
        # object instead -- confirm against the installed version.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        # One dict per scraped posting (see ``parse_detail_page``).
        self.positions = []

    def run(self):
        """Crawl result pages until the pager's next button is disabled.

        Scraped postings accumulate in ``self.positions``. A timeout while
        waiting for the pager propagates to the caller.
        """
        self.driver.get(self.url)
        while True:
            # Wait for the pager *before* reading the page source, so the
            # snapshot is not taken while the page is still being rendered.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, self.next_btn_xpath))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            if not self._go_to_next_page(source):
                break
            # Give the next result page a moment to load.
            time.sleep(1)

    def _go_to_next_page(self, source):
        """Click the pager's next button.

        Returns True when the click succeeded, and False when the button is
        disabled (last page) or the click kept failing. ``source`` is the
        current page's HTML; it is printed for diagnosis when all attempts
        fail.
        """
        # Function-scope import: keeps this block independent of the file's
        # top-level import list.
        from selenium.common.exceptions import WebDriverException

        for attempt in range(1, self.max_next_page_attempts + 1):
            try:
                next_btn = self.driver.find_element(By.XPATH, self.next_btn_xpath)
                # get_attribute may return None when the attribute is absent.
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    return False
                next_btn.click()
                return True
            except WebDriverException as exc:
                # Only WebDriver failures are retried; KeyboardInterrupt and
                # programming errors are no longer swallowed.
                print("next page click failed (attempt %d/%d): %s"
                      % (attempt, self.max_next_page_attempts, exc))
                time.sleep(1)
        # Retrying only the click (not the whole page) avoids scraping the
        # same postings again; after the last attempt dump the page and stop.
        print(source)
        return False

    def parse_list_page(self, source):
        """Visit every posting linked from one result-list page.

        ``source`` is the HTML of the list page.
        """
        html = etree.HTML(source)
        for link in html.xpath("//a[@class='position_link']/@href"):
            self.request_detail_page(link)
            # Pause between detail pages.
            time.sleep(1)

    def request_detail_page(self, url):
        """Open ``url`` in a new tab, parse it, then return to the list tab.

        The detail tab is closed and focus is restored to the list tab even
        when waiting for or parsing the page raises; the exception itself
        still propagates.
        """
        list_handle = self.driver.current_window_handle
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        try:
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='job-name']/span[@class='name']")
                )
            )
            self.parse_detail_page(self.driver.page_source)
        finally:
            # Close the detail tab and hand control back to the list tab.
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    @staticmethod
    def _first_text(nodes, index=0):
        """Return the stripped first text under ``nodes[index]``, or ''."""
        if index >= len(nodes):
            return ""
        texts = nodes[index].xpath(".//text()")
        return texts[0].strip() if texts else ""

    @staticmethod
    def _strip_separators(text):
        """Remove whitespace and '/' characters from ``text``."""
        return re.sub(r"[\s/]", "", text)

    def parse_detail_page(self, source):
        """Extract one posting from a detail page and store it.

        ``source`` is the HTML of the detail page. The resulting dict has the
        keys ``name``, ``company_name``, ``salary``, ``city``,
        ``work_years``, ``education`` and ``desc``; it is appended to
        ``self.positions`` and printed. Fields missing from the page become
        empty strings instead of raising ``IndexError``.
        """
        html = etree.HTML(source)

        names = html.xpath("//span[@class='name']/text()")
        position_name = names[0] if names else ""

        # The spans are read positionally: salary, city, years, education.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = self._first_text(job_request_spans, 0)
        city = self._strip_separators(self._first_text(job_request_spans, 1))
        work_years = self._strip_separators(self._first_text(job_request_spans, 2))
        education = self._strip_separators(self._first_text(job_request_spans, 3))

        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()

        company_names = html.xpath("//h2[@class='fl']/text()")
        company_name = company_names[0].strip() if company_names else ""

        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc
        }
        self.positions.append(position)
        print(position)
        print('='*40)
if __name__ == '__main__':
    # Script entry point: crawl every result page, then shut the browser down.
    spider = LagouSpider()
    try:
        spider.run()
    finally:
        # End the WebDriver session even when the crawl raises, so no
        # browser / chromedriver process is left behind.
        spider.driver.quit()
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
웹 크롤러--25.【selenium 실전】라거우(Lagou) 채용 사이트 크롤러 --selenium으로 데이터 가져오기
텍스트를 자유롭게 공유하거나 복사할 수 있습니다. 하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.