Week2_Practice1

3466 단어

MainInformation.py


함수 기능: 셋방 페이지 정보 얻기
import requests
from bs4 import BeautifulSoup
import time
def getMainInformation(url):
    """Fetch one xiaozhu.com listing page and extract its main fields.

    Args:
        url: URL of a listing detail page
            (e.g. http://bj.xiaozhu.com/fangzi/....html).

    Returns:
        dict with keys 'title', 'price', 'sex', 'name', 'photo',
        'add' (address) and 'ownerPhoto'. Any field whose selector
        matches nothing keeps the placeholder value 'hello'.
    """
    # Timeout so a stalled server cannot hang the crawler forever.
    html=requests.get(url, timeout=10)
    bsHtml=BeautifulSoup(html.text,'lxml')

    # Placeholder values survive when a selector finds no match.
    data={
        'title':'hello',
        'price':'hello',
        'sex':'hello',
        'name':'hello',
        'photo':'hello',
        'add':'hello',
        'ownerPhoto':'hello'
    }
    # Listing title
    midTitle=bsHtml.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    for i in midTitle:
        data['title']=i.get_text()


    # Address (first span next to the title)
    midAdd=bsHtml.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
    for i in midAdd:
        data['add']=i.get_text().strip()


    # Nightly price
    midPri=bsHtml.select('#pricePart > div.day_l > span')
    for i in midPri:
        data['price']=i.get_text()


    # Owner's avatar image URL
    midOwnerPhone=bsHtml.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    for i in midOwnerPhone:
        data['ownerPhoto']=i.get('src')


    # Owner's gender: the badge <div> carries class 'member_ico' for
    # male, a different class for female.
    midSex=bsHtml.select('div[class="member_pic"] > div')
    for i in midSex:
        x=i.get('class')
        if x[0]=='member_ico':
           data['sex']='male'
        else:
            data['sex']='female'


    # Owner's display name
    midName=bsHtml.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    for i in midName:
        data['name']=i.get_text()


    # First large (800px) listing photo only — hence the break.
    midPh = bsHtml.select('#detailImageBox > div.pho_show_r > div > ul > li > img[data-width="800"]')

    for i in midPh:
        data['photo'] = i.get('data-src')
        break
    time.sleep(1)  # politeness delay so we don't hammer the site
    return data


#  
# Demo run: fetch one listing page and show the extracted fields
# (the original discarded the return value, making the call useless).
url='http://bj.xiaozhu.com/fangzi/1466098635.html'
print(getMainInformation(url))

getPageHref.py


함수 기능: 셋방 링크 가져오기
import requests
from bs4 import BeautifulSoup
import time
def getPageHref(url):
    """Fetch a xiaozhu.com search-result page and collect listing links.

    Args:
        url: URL of a short-rent search page
            (e.g. http://bj.xiaozhu.com/search-duanzufang-p1-0/).

    Returns:
        list of href strings, one per listing card on the page.
    """
    # Timeout so a stalled server cannot hang the crawler forever.
    html=requests.get(url, timeout=10)
    bsHtml=BeautifulSoup(html.text,'lxml')

    # One <a class="resule_img_a"> per listing card.
    hrefs=bsHtml.select('#page_list > ul > li > a[class="resule_img_a"]')
    self_hrefs=[i.get('href') for i in hrefs]
    time.sleep(1)  # politeness delay so we don't hammer the site
    return self_hrefs


#  
# Demo run: list the listing links found on the first search page
# (the original discarded the return value, making the call useless).
url='http://bj.xiaozhu.com/search-duanzufang-p1-0/'
print(getPageHref(url))

Main.py


함수 기능: 집값이 400보다 큰 전세 정보를 선별
import getPageHref
import MainInformation
import pymongo


def main():
    """Print every stored listing whose price is at least 400.

    Reads documents from the local MongoDB collection
    houseMess.sheet_tab, which the commented-out crawl step below
    populates (run it once, then re-run with it commented out).
    """
    urls=['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1,3)]
    client=pymongo.MongoClient('localhost',27017)
    houseMess=client['houseMess']
    sheet_tab=houseMess['sheet_tab']
    # Crawl step — uncomment to (re)populate the collection:
    # for url in urls:
    #     urlss=getPageHref.getPageHref(url)
    #     for i in urlss:
    #         data=MainInformation.getMainInformation(i)
    #         sheet_tab.insert_one(data)

    # Keep listings priced at 400 or more.
    house=[]
    for i in sheet_tab.find():
         # float() instead of eval(): 'price' comes from scraped,
         # untrusted HTML — eval() on it is a code-injection risk.
         if float(i['price'])>=400:
             house.append(i)
    for i in house:
        print(i)


if __name__ == '__main__':
    main()

좋은 웹페이지 즐겨찾기