간지왕(赶集网)의 중고 물품 카테고리를 순회하며 모든 상품의 상세 정보를 수집하기

3773 단어
1. page_parsing.py: 페이지 파싱 코드

# -*- coding: utf-8 -*-
import requests, time, pymongo, lxml, random
from bs4 import BeautifulSoup
# Request headers: a fixed desktop User-Agent so the site serves normal pages.
headers={
'User-Agent':'*****************************    Safari/537.36',

}
# MongoDB connection and the collections used by the three spiders below.
client=pymongo.MongoClient('localhost',27017)
items=client['itmes']  # NOTE(review): 'itmes' looks like a typo for 'items' — kept as-is, the existing data lives under this DB name
items_link=items['items_link']      # stores one document per discovered item URL
# items_info=items['items_info']
items_info9=items['items_info9']    # stores the scraped detail fields per item

# Optional proxy rotation, currently disabled.
#proxies=[]
#proxy_ip=random.choice(proxies)

# spider1: collect the category (channel) links from the start page
# spider2: walk every listing page of one channel and collect item links
# spider3: scrape the detail page of a single item

# spider1 entry data: the "all goods" index page and the host used to
# absolutize the relative category hrefs found on it.
start_url='http://bj.ganji.com/wu/'
host='http://bj.ganji.com'
def get_class_links(start_url):
    """Fetch the start page and return the list of absolute category links.

    Every ``<dt> > <a>`` anchor on the page is a category; its relative
    href is prefixed with ``host``.  The original built the list but never
    returned it (and printed the growing list on every iteration), so
    callers could not use the result — it is now returned, and printed
    once after it is complete.

    :param start_url: URL of the category index page.
    :return: list of absolute category URLs.
    """
    wb_data = requests.get(start_url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    channels_url = [host + url.get('href') for url in soup.select('dt > a')]
    print(channels_url)
    return channels_url

#get_class_links(start_url)


#spider2             
# channels_url='http://bj.ganji.com/shouji/'
def get_items_from(channels_url):
    """Walk listing pages pn1..pn99 of one channel and store every item link.

    For each item anchor found on a listing page, the detail page is
    scraped immediately via ``get_items_info`` and the link is recorded in
    the ``items_link`` collection.

    The original had a syntax error here: the ``if status_code == 200``
    test had no indented suite.  Rewritten as a guard clause that skips
    pages returning a non-200 status (e.g. when the IP has been blocked).

    :param channels_url: base URL of one category, e.g.
        ``'http://bj.ganji.com/shouji/'`` (must end with ``/``).
    """
    for page in range(1, 100):
        channel_url = "{}pn{}/".format(channels_url, page)
        wb_data = requests.get(channel_url, headers=headers)
        if wb_data.status_code != 200:
            # Blocked or missing page: skip it rather than crash.
            continue
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for item in soup.select(' dd.feature > div > ul > li > a'):
            item_link = item.get('href')
            item_data = {
                'item': item_link
            }
            get_items_info(item_link)
            items_link.insert_one(item_data)
            print(item_data)

# get_items_from(channels_url)

#spider3            
#item_link='http://bj.ganji.com/shouji/2079187773x.htm'
def get_items_info(item_link):
    """Scrape one item detail page and store its fields in ``items_info9``.

    Silently does nothing on a non-200 response (e.g. blocked IP).

    Fixes relative to the original:
    - the ``insert_one`` line was mis-indented (IndentationError);
    - the ``for`` unpack order was ``..., cate, quality`` while the zip
      order was ``..., qualities, cates``, so the stored ``qualities`` and
      ``cate`` fields were swapped;
    - locals no longer shadow the ``time`` module and the ``type`` builtin.

    :param item_link: absolute URL of one item detail page.
    """
    wb_data = requests.get(item_link, headers=headers)
    if wb_data.status_code == 200:
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('.title-name')
        times = soup.select('.pr-5')
        types = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')
        prices = soup.select('i.f22')
        adrs = soup.select('ul.det-infor > li:nth-of-type(3)')
        cates = soup.select('div.h-crumbs')
        qualities = soup.select(' div.leftBox > div:nth-of-type(4) > div.det-    summary > div > div ')
        # zip order and unpack order now agree element-for-element.
        for title, pub_time, item_type, price, adr, quality, cate in zip(
                titles, times, types, prices, adrs, qualities, cates):
            items_data = {
                'title': title.get_text(),
                'times': pub_time.get_text().split(),
                'type': item_type.get_text(),
                'price': price.get_text(),
                'adr': list(adr.stripped_strings),
                'qualities': list(quality.stripped_strings),
                'cate': cate.get_text()
            }
            items_info9.insert_one(items_data)
            print(items_data)
2. main.py: 메인 코드

    from multiprocessing import Pool
    from parsing_web import get_items_from,get_items_info
    from channels_url import channels_url
    import pymongo

    #channels_url=['http://bj.ganji.com/shouji/']
    if __name__ == '__main__':
        # Fan the channel list out over six worker processes; each worker
        # runs get_items_from on one channel URL.
        worker_pool = Pool(processes=6)
        worker_pool.map(get_items_from, channels_url)
        # Stop accepting work, then wait for all workers to finish.
        worker_pool.close()
        worker_pool.join()
    

    3.count.py: 실시간(3초 간격) 상품 수량 계산 코드
    from parsing_web import items_link
    from parsing_web import items_info9
    import time
    # Progress monitor: print the number of scraped items every 3 seconds.
    while True:
        # count_documents({}) replaces Cursor.count() (find().count()),
        # which was deprecated in PyMongo 3.7 and removed in PyMongo 4.x.
        print(items_info9.count_documents({}))
        time.sleep(3)

    좋은 웹페이지 즐겨찾기