스크래핑 시작
11385 단어 · 태그: 파이썬, 셀레늄, Windows, BeautifulSoup
소개
준비
Google 크롬
WebDriver
chromedriver
【폴더 구성】
파이썬
selenium
PyPI - selenium
공식 사이트
selenium
pip install selenium
Beautifulsoup4
PyPI - beautifulsoup4
일본어 문서
BS4
pip install beautifulsoup4
Hello Beautifulsoup4!
# scraping.py -- minimal Beautiful Soup 4 walkthrough: parse a small HTML
# document in memory and exercise the common navigation/search APIs.
# (The filename was fused into the import line in the original; it is a
# comment now so the script actually runs.)
from bs4 import BeautifulSoup

# Fixed HTML fixture that every example below parses.
html_doc = """
<!DOCTYPE html>
<html>
<head>
<title>TEST SOUP</title>
</head>
<body>
<h1>Hello BS4</h1>
<p class="font-big">python scraping</p>
<button id="start" @click="getURI">Start</button>
<ul>
<li><a href="https://www.yahoo.co.jp">Yahoo</a></li>
<li><a href="https://www.google.co.jp">Google</a></li>
<li><a href="https://www.amazon.co.jp/">Amazon</a></li>
</ul>
</body>
</html>
"""

# 'html.parser' is the stdlib-backed parser: no extra dependency needed.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())          # re-indented document
print(soup.title)               # the <title> tag object
print(soup.title.name)          # tag name: 'title'
print(soup.title.string)        # tag text: 'TEST SOUP'
print(soup.title.parent.name)   # enclosing tag: 'head'
print(soup.h1)                  # first <h1>
print(soup.p)                   # first <p>
print(soup.p['class'])          # class attribute (bs4 returns a list)
print(soup.button)              # first <button>
print(soup.find(id='start'))    # lookup by id attribute
print(soup.a)                   # first <a>
print(soup.find_all('a'))       # every <a> tag
for link in soup.find_all('a'):
    print(link.get('href'))     # href of each anchor
print(soup.get_text())          # all text content with tags stripped
Hello Selenium!
# Quick reference: common Selenium WebDriver element lookups and actions.
# (A stray bare `driver` token was fused before the first comment in the
# original, which would raise NameError; it is part of this comment now.)
# How to locate elements
#driver.find_element_by_id('ID')
#driver.find_element_by_class_name('CLASS_NAME')
#driver.find_element_by_name('NAME')
#driver.find_element_by_css_selector('CSS_SELECTOR')
#driver.find_element_by_xpath('XPath')
#driver.find_element_by_link_text('LINK_TEXT')
#driver.find_element_by_partial_link_text('LINK_TEXT')
# Operating on elements
#driver.find_element_by_id('ID').click()
#el = driver.find_element_by_id('ID')
#driver.execute_script("arguments[0].click();", el)
#driver.find_element_by_id('ID').send_keys('STRINGS')
#driver.find_element_by_id('ID').text
#driver.find_element_by_id('ID').get_attribute('ATTRI_NAME')
#driver.find_element_by_id('ID').clear()
# Page navigation
#driver.back()
#driver.forward()
#driver.refresh()
#driver.close()
#driver.quit()
# selenium.py -- load a page with Chrome WebDriver, wait until it has
# rendered, then extract every <code> tag with Beautiful Soup.
# (The filename was fused into the first import line in the original.)
import time
import os

# Make the bundled chromedriver discoverable on PATH. os.pathsep keeps this
# portable (';' on Windows, ':' elsewhere) instead of hard-coding ';', and
# the separator goes BETWEEN entries, not after the new one.
os.environ['PATH'] = os.getenv('PATH', '') + os.pathsep + './Scripts/chromedriver_binary'
# WebDriver: https://sites.google.com/a/chromium.org/chromedriver/downloads
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

HEADLESS = False  # set True to run Chrome without a visible window
URL = 'https://docs.python.org/ja/3/py-modindex.html'
SELECTOR = 'body > div.footer'  # presence of the footer means the page loaded

op = Options()
if HEADLESS:
    op.add_argument("--headless")
# 'options=' replaces the deprecated 'chrome_options=' keyword, which was
# removed in Selenium 4.
driver = webdriver.Chrome(options=op)
try:
    driver.get(URL)
    # Block (up to 30 s) until the target element is present in the DOM.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, SELECTOR))
    )
    # SoupStrainer restricts parsing to <code> tags only -- faster and
    # keeps the tree small.
    code_tag = SoupStrainer('code')
    sp = BeautifulSoup(driver.page_source, features='html.parser', parse_only=code_tag)
    for c in sp.find_all('code'):
        print(c.string)
finally:
    # Always release the browser process, even if the wait times out.
    driver.quit()
결론
# Beautiful Soup 4 demo: parse an in-memory HTML snippet and print the
# results of the most common tag-access and search operations.
from bs4 import BeautifulSoup

html_doc = """
<!DOCTYPE html>
<html>
<head>
<title>TEST SOUP</title>
</head>
<body>
<h1>Hello BS4</h1>
<p class="font-big">python scraping</p>
<button id="start" @click="getURI">Start</button>
<ul>
<li><a href="https://www.yahoo.co.jp">Yahoo</a></li>
<li><a href="https://www.google.co.jp">Google</a></li>
<li><a href="https://www.amazon.co.jp/">Amazon</a></li>
</ul>
</body>
</html>
"""

parsed = BeautifulSoup(html_doc, 'html.parser')

# Whole-document view, then single-tag accessors and attribute lookups.
print(parsed.prettify())
print(parsed.title)
print(parsed.title.name)
print(parsed.title.string)
print(parsed.title.parent.name)
print(parsed.h1)
print(parsed.p)
print(parsed.p['class'])
print(parsed.button)
print(parsed.find(id='start'))
print(parsed.a)
print(parsed.find_all('a'))

# One line per anchor: just its href attribute.
for anchor in parsed.find_all('a'):
    print(anchor.get('href'))

# Finally, all text content with markup stripped.
print(parsed.get_text())
# How to locate elements
#driver.find_element_by_id('ID')
#driver.find_element_by_class_name('CLASS_NAME')
#driver.find_element_by_name('NAME')
#driver.find_element_by_css_selector('CSS_SELECTOR')
#driver.find_element_by_xpath('XPath')
#driver.find_element_by_link_text('LINK_TEXT')
#driver.find_element_by_partial_link_text('LINK_TEXT')
# Operating on elements
#driver.find_element_by_id('ID').click()
#el = driver.find_element_by_id('ID')
#driver.execute_script("arguments[0].click();", el)
#driver.find_element_by_id('ID').send_keys('STRINGS')
#driver.find_element_by_id('ID').text
#driver.find_element_by_id('ID').get_attribute('ATTRI_NAME')
#driver.find_element_by_id('ID').clear()
# Page navigation
#driver.back()
#driver.forward()
#driver.refresh()
#driver.close()
#driver.quit()
# Load a page with Chrome WebDriver, wait until it has rendered, then
# extract every <code> tag with Beautiful Soup.
import time
import os

# Make the bundled chromedriver discoverable on PATH. os.pathsep keeps this
# portable (';' on Windows, ':' elsewhere) instead of hard-coding ';', and
# the separator goes BETWEEN entries, not after the new one.
os.environ['PATH'] = os.getenv('PATH', '') + os.pathsep + './Scripts/chromedriver_binary'
# WebDriver: https://sites.google.com/a/chromium.org/chromedriver/downloads
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

HEADLESS = False  # set True to run Chrome without a visible window
URL = 'https://docs.python.org/ja/3/py-modindex.html'
SELECTOR = 'body > div.footer'  # presence of the footer means the page loaded

op = Options()
if HEADLESS:
    op.add_argument("--headless")
# 'options=' replaces the deprecated 'chrome_options=' keyword, which was
# removed in Selenium 4.
driver = webdriver.Chrome(options=op)
try:
    driver.get(URL)
    # Block (up to 30 s) until the target element is present in the DOM.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, SELECTOR))
    )
    # SoupStrainer restricts parsing to <code> tags only -- faster and
    # keeps the tree small.
    code_tag = SoupStrainer('code')
    sp = BeautifulSoup(driver.page_source, features='html.parser', parse_only=code_tag)
    for c in sp.find_all('code'):
        print(c.string)
finally:
    # Always release the browser process, even if the wait times out.
    driver.quit()
Reference
이 문제(스크래핑 시작)에 관하여, 더 많은 자료는 다음 링크에서 확인할 수 있습니다: https://qiita.com/sireline/items/2621b87032975427bfb7 . 텍스트를 자유롭게 공유하거나 복사할 수 있습니다. 하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
우수한 개발자 콘텐츠 발견에 전념 (Collection and Share based on the CC Protocol.)