Parsing Libraries

Regular Expressions

Single-character matches:
    . : any character except a newline
    []: any one character from the set, e.g. [aoe], [a-w]
    \d: a digit, same as [0-9]
    \D: a non-digit
    \w: a word character (letter, digit, underscore, CJK character)
    \W: anything \w does not match
    \s: a whitespace character (space, tab, newline, ...)
    \S: a non-whitespace character

Quantifiers:
    *: the preceding token repeats 0 or more times (>=0)
    +: one or more times
    ?: zero or one time
    {m}: exactly m times
    {m,}: at least m times
    {m,n}: between m and n times

Boundaries:
    \b / \B: word boundary / non-word boundary
    $: matches at the end of the string
    ^: matches at the start of the string

Grouping:
    (): groups a subpattern so it can be quantified as a whole, e.g. (ab){3}
    (): also captures, so backreferences \1, \2 refer to earlier groups

Lazy (non-greedy) quantifiers:
    .*? and .+?

Flags:
    re.I: ignore case
    re.M: multi-line mode, ^ and $ match at every line
    re.S: let . match newlines too

Functions:
    match / search / findall
    re.sub(pattern, replacement, string)
    re.compile(pattern) builds a reusable pattern object with the same match/search/findall methods
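A minimal sketch of these functions and flags in Python; the sample text and patterns are made up for illustration:

import re

text = "Name: Alice, Age: 30\nname: bob, age: 25"

# match only tries the very beginning of the string
print(re.match(r"\w+", text).group())            # Name
# search scans the whole string for the first hit
print(re.search(r"\d+", text).group())           # 30
# findall returns every non-overlapping hit; re.I ignores case, re.M makes ^ match each line
print(re.findall(r"^name", text, re.I | re.M))   # ['Name', 'name']
# re.S lets . cross newlines; .*? is the lazy form of .*
print(re.search(r"Alice.*?age", text, re.S | re.I).group())
# re.sub(pattern, replacement, string)
print(re.sub(r"\d+", "##", text))
# re.compile builds a reusable pattern object
pattern = re.compile(r"(\w+): (\w+)")
print(pattern.findall(text))                     # [('Name', 'Alice'), ('Age', '30'), ...]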

a. xpath
from lxml import etree

# xpath is a path language for pulling nodes out of an xml document (html is treated as a form of xml)
# lxml is a third-party library for processing xml; etree is the module in lxml that parses a document into an element tree
# Basic workflow (with a local file):
# 1. Parse the local html file with etree into an element tree, then query the tree
html_tree = etree.parse("./test.html")
print(html_tree)
# 2. Query nodes layer by layer with an absolute path
ret = html_tree.xpath("/html/body/div/ol[1]/li/text()")
ret = html_tree.xpath("/html/body/div/div[2]/a/@href")   # xpath         @     
# every xpath call returns its matches as a list
# 3. Abbreviated paths
# "/" walks down one level at a time from the root; "//" finds the element anywhere in the tree
ret = html_tree.xpath("//li/text()")
# Attribute filtering
# find all li elements that carry a class attribute
ret = html_tree.xpath("//li[@class]")
# find all li elements whose class is exactly "hehe"
ret = html_tree.xpath('//li[@class="hehe"]')
ret = html_tree.xpath('//li[@class="haha pp"]')  # the attribute value must match the whole string, multiple classes included
# Fuzzy matching
# li elements whose class starts with "h"
ret = html_tree.xpath('//li[starts-with(@class, "h")]')
# li elements whose class contains an "a"
ret = html_tree.xpath('//li[contains(@class, "a")]')
# ret = html_tree.xpath('//li[ends-with(@class, "a")]')   # ends-with is XPath 2.0; lxml implements XPath 1.0, so this raises an error

# Logical operators
# li elements whose class is "hehe" and whose id is "tata"
ret = html_tree.xpath("//li[@class='hehe' and @id='tata']")

# li elements whose class is "hehe" or whose class contains an "a"
ret = html_tree.xpath("//li[@class='hehe' or contains(@class, 'a')]")


obj = html_tree.xpath("//div[@id='pp']")[0]  # xpath returns a list; take the first matched div as a new starting node
print(obj)

# query again starting from obj, i.e. relative to that node
# ret = obj.xpath('//li/text()')  # wrong: a path beginning with // still searches the whole document, not just obj
ret = obj.xpath(".//li/text()")  # "." anchors the query at obj itself

# print(ret)
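When the html arrives as a string from a response instead of a local file, etree.HTML builds the same kind of tree; a small sketch with a made-up snippet:

from lxml import etree

html_str = '<div id="pp"><ol><li class="hehe">one</li><li class="haha pp">two</li></ol></div>'
# etree.HTML parses a string (and tolerates sloppy html) instead of a file
string_tree = etree.HTML(html_str)
print(string_tree.xpath('//li[@class="hehe"]/text()'))   # ['one']
# tostring serializes a node back out, handy for checking what was actually parsed
print(etree.tostring(string_tree, pretty_print=True).decode())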


b. bs4
from bs4 import BeautifulSoup

# Load an html document into a BeautifulSoup object
soup = BeautifulSoup(open("./soup_test.html", encoding='utf-8'), "lxml")
# arg 1: an html file object or string; arg 2: the parser (bs4 does not parse html itself, it delegates to a parser, and lxml is the usual choice)
# print(soup)
# 1. Access a tag as an attribute; this returns only the first matching tag in the document
# print(soup.title)
# print(soup.li)
# 2. Tag text
obj = soup.a
# print(obj.string)   # only the tag's own direct text; if the tag contains nested tags, .string is None
print(obj.get_text())  # all the text inside the tag, descendants included, joined into one string
# 3. Attributes
print(obj.get("href"))  # read an attribute with get
print(obj["href"])  # or with dictionary-style indexing
print(obj.attrs)  # every attribute of the tag as a dict
print(obj.name)  # the tag's name

# Direct children of a node
# print(soup.body.children)
# for child in soup.body.children:
#     print(child)

# All descendants of a node, text nodes included
# print(soup.body.descendants)
# for i in soup.body.descendants:
#     print(i)

# 4. Searching by condition
# find: returns only the first match
# print(soup.find("a"))  # the first a tag in the document
# print(soup.find("a", id='hong'))

# find_all: returns every match as a list
# print(soup.find_all("a"))
# print(soup.find_all(["a", "span", "li"]))
# print(soup.find_all(["a", "span", "li"], class_='taohua'))
# print(soup.find_all(["a", "span", "li"], limit=3))

# select: locate nodes with css selectors
print(soup.select(".tang ul li"))  # descendant selector
print(soup.select("li#hong"))  # id selector
print(soup.select("[name='he']"))   # attribute selector


4. Data Storage
# Persisting the scraped data
import json
import csv
import pymysql

# Save as json
def write_to_json(data):
    # flatten the nested pages into one flat list that json can serialize
    json_list = []
    for houses in data:
        for house in houses:
            json_list.append(house)

    with open("lianjie.json", "w", encoding="utf-8") as fp:
        fp.write(json.dumps(json_list))
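All three writers in this section assume the same shape for data: a list of pages, where each page is a list of dicts keyed title, house, position, totalPrice, unitPrice, img. A hypothetical call for illustration:

# hypothetical input: one page containing one record
data = [[{
    "title": "Two-bedroom near the park",
    "house": "2室1厅 | 89㎡",
    "position": "Chaoyang",
    "totalPrice": "450",
    "unitPrice": "50562",
    "img": "https://example.com/house.jpg",
}]]
write_to_json(data)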


# Save as csv
def write_to_csv(data):
    # build the csv rows: flatten data into a list of per-record value lists
    items = []
    for houses in data:
        for house in houses:
            item = []
            # house is a dict; take its values, in order, as one row (the order must line up with the header below)
            for v in house.values():
                item.append(v)
            items.append(item)
    # write the csv file; newline="" avoids blank lines between rows on Windows
    with open("lianjia.csv", "w", newline="", encoding="utf-8") as fp:
        # wrap fp in a csv writer
        w = csv.writer(fp)
        # header row
        w.writerow(["title", "house", "position", "totalPrice", "unitPrice", "img"])
        # data rows
        w.writerows(items)


# Save to mysql
def write_to_mysql(data):
    # connect to the local mysql server (the lianjia database must already exist)
    conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="qaz1693146287", db="lianjia", charset="utf8")
    # a cursor is needed to execute sql statements
    cursor = conn.cursor()
    conn.begin()
    # build and execute one insert per record
    for houses in data:
        for house in houses:
            # parameterized query: the driver quotes and escapes the values safely
            sql = "insert into lianjia values(null, %s, %s, %s, %s, %s, %s)"
            cursor.execute(sql, (house.get("title"), house.get("house"), house.get("position"),
                                 house.get("totalPrice"), house.get("unitPrice"), house.get("img")))
    # commit once after all rows are executed
    conn.commit()
    # release the cursor and the connection
    cursor.close()
    conn.close()
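Row-by-row execute works, but pymysql's executemany sends the whole batch through one call; a sketch under the same table layout:

def write_to_mysql_batch(data):
    conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                           password="qaz1693146287", db="lianjia", charset="utf8")
    cursor = conn.cursor()
    # flatten all pages into parameter tuples, in header order
    rows = [(h.get("title"), h.get("house"), h.get("position"),
             h.get("totalPrice"), h.get("unitPrice"), h.get("img"))
            for houses in data for h in houses]
    # executemany repeats the parameterized insert for every tuple
    cursor.executemany("insert into lianjia values(null, %s, %s, %s, %s, %s, %s)", rows)
    conn.commit()
    cursor.close()
    conn.close()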

5. selenium
selenium + phantomjs, or selenium + chrome

selenium: a web automation testing tool. Selenium can drive a browser according to our instructions, loading pages, clicking, typing, and reading back whatever the browser has rendered.

In python, selenium's webdriver classes control the browser through a separate driver executable, so a matching driver has to be installed alongside the library.

Install: pip install selenium

Commonly used selenium methods:

find_element_by_id()

find_elements_by_name()

find_elements_by_xpath()

find_elements_by_tag_name()

find_elements_by_class_name()

find_elements_by_css_selector()

find_elements_by_link_text()

Common element actions:

click(): simulate a mouse click

send_keys(): simulate keyboard input

switch_to_alert(): switch to a popup alert

Switching into an iframe:
driver.switch_to.frame(driver.find_element_by_id("login_frame"))

Getting the rendered page source:
driver.page_source
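A minimal end-to-end sketch tying these calls together; the URL and the element ids are placeholders, and note that selenium 4 replaced the find_element_by_* helpers with find_element(By.ID, ...):

from selenium import webdriver

driver = webdriver.Chrome()                      # assumes chromedriver is on the PATH (method 2 below)
driver.get("https://example.com/login")          # placeholder URL

# switch into the iframe before touching anything inside it
driver.switch_to.frame(driver.find_element_by_id("login_frame"))   # hypothetical frame id
driver.find_element_by_id("username").send_keys("user")            # hypothetical input id
driver.find_element_by_id("submit").click()                        # hypothetical button id

driver.switch_to.default_content()               # step back out of the iframe
print(driver.page_source[:200])                  # the html after the browser ran its js
driver.quit()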

chromedriver: the driver that lets selenium control Chrome

Two ways to load it:


Method 1: driver = webdriver.Chrome(r'/Users/fanjianbo/Desktop/chromedriver')


Method 2: add the chromedriver directory to the PATH environment variable, then driver = webdriver.Chrome()

[Note] the chromedriver version must match the installed Chrome version; see the mapping link below


Download page for the Chrome driver: http://chromedriver.storage.googleapis.com/index.html or http://npm.taobao.org/mirrors/chromedriver/2.37/


Mapping between chromedriver and Chrome versions: http://blog.csdn.net/huilan_same/article/details/51896672

phantomjs: a headless (no-GUI) browser


Two ways to load it:

Method 1: driver = webdriver.PhantomJS(r"C:\Users\ZBLi\Desktop\1706\day04\ziliao\phantomjs-2.1.1-windows\bin\phantomjs.exe")

Method 2: put phantomjs on the C: drive, add its bin directory to the PATH environment variable, then driver = webdriver.PhantomJS()

[Note] PhantomJS is no longer maintained, and newer selenium releases have dropped support for it; headless Chrome is the usual replacement.


Download: http://phantomjs.org/download.html

