Python에서 XPath로 웹 데이터를 추출하여 Django 모델에 저장하기

친구의 사이트 제작을 돕다 보니 제품 데이터가 필요해졌습니다. 다른 회사의 제품을 대리 판매하는 사이트이므로, 해당 회사 웹사이트에서 제품 데이터를 직접 가져오기로 했습니다.

1. 데이터베이스 설계


from django.db import models
from uuslug import slugify
import uuid
import os


def products_directory_path(instance, filename):
  """Build the storage path for a product's main image.

  The uploaded file name is not kept: only the text after its last dot is
  reused, appended to the first 8 hex characters of a random UUID4.

  Args:
    instance: object exposing a ``title`` attribute, used as the folder name.
    filename: name of the uploaded file.

  Returns:
    ``images/products/<instance.title>/<8 hex chars>.<extension>``, joined
    with ``os.path.join``.
  """
  extension = filename.rsplit('.', 1)[-1]
  random_stem = uuid.uuid4().hex[:8]
  return os.path.join('images', 'products', instance.title, random_stem + '.' + extension)


def product_relatedimage_directory_path(instance, filename):
  """Build the storage path for an image related to a product.

  The uploaded file name is not kept: only the text after its last dot is
  reused, appended to the first 8 hex characters of a random UUID4.

  Args:
    instance: object whose ``product.title`` is used as the folder name.
    filename: name of the uploaded file.

  Returns:
    ``images/product_relatedimage/<instance.product.title>/<8 hex chars>.<extension>``,
    joined with ``os.path.join``.
  """
  extension = filename.rsplit('.', 1)[-1]
  random_stem = uuid.uuid4().hex[:8]
  folder = os.path.join('images', 'product_relatedimage', instance.product.title)
  return os.path.join(folder, random_stem + '.' + extension)


class ProductsCategory(models.Model):
  """Product category, optionally nested under another category.

  ``slug`` is derived from ``name`` by ``save()`` whenever the row has no id
  yet or the slug is empty.
  """
  name = models.CharField(verbose_name=' ', max_length=80, unique=True)
  description = models.TextField(verbose_name=' ', blank=True, null=True)
  slug = models.SlugField(verbose_name='slug', max_length=80, blank=True, null=True)
  # Self-referencing link; on_delete=CASCADE means deleting a parent also
  # deletes the categories that point at it.
  parent_category = models.ForeignKey(
    'self',
    verbose_name=" ",
    blank=True,
    null=True,
    on_delete=models.CASCADE,
  )

  class Meta:
    ordering = ['name']
    verbose_name = " "
    verbose_name_plural = verbose_name

  def __str__(self):
    return self.name

  def save(self, *args, **kwargs):
    """Fill in ``slug`` from ``name`` when needed, then save normally."""
    has_id_and_slug = self.id and self.slug
    if not has_id_and_slug:
      self.slug = slugify(self.name)
    super().save(*args, **kwargs)


class ProductsTag(models.Model):
  """Tag that can be attached to products.

  ``slug`` is derived from ``name`` by ``save()`` whenever the row has no id
  yet or the slug is empty.
  """
  name = models.CharField(verbose_name=' ', max_length=30, unique=True)
  slug = models.SlugField(verbose_name='slug', max_length=40)

  class Meta:
    ordering = ['name']
    verbose_name = " "
    verbose_name_plural = verbose_name

  def __str__(self):
    return self.name

  def save(self, *args, **kwargs):
    """Fill in ``slug`` from ``name`` when needed, then save normally."""
    has_id_and_slug = self.id and self.slug
    if not has_id_and_slug:
      self.slug = slugify(self.name)
    super().save(*args, **kwargs)


class Product(models.Model):
  """A product with a unique title, a main image and a view counter.

  ``slug`` is derived from ``title`` by ``save()`` whenever the row has no id
  yet or the slug is empty. ``category`` and ``tags`` are optional.
  """
  title = models.CharField(' ', max_length=255, unique=True)
  slug = models.SlugField('slug', max_length=255, blank=True, null=True)
  # Free-form text; the import script in this article stores the serialized
  # <table> markup of the product page here.
  jscs = models.TextField(' ', blank=True, null=True)
  image = models.ImageField(upload_to=products_directory_path, verbose_name=" ")
  views = models.PositiveIntegerField(' ', default=0)
  category = models.ForeignKey('ProductsCategory', verbose_name=' ', on_delete=models.CASCADE, blank=True, null=True)
  tags = models.ManyToManyField('ProductsTag', verbose_name=' ', blank=True)

  def save(self, *args, **kwargs):
    """Fill in ``slug`` from ``title`` when needed, then save normally."""
    if not self.id or not self.slug:
      self.slug = slugify(self.title)
    super().save(*args, **kwargs)

  def update_views(self):
    """Increment ``views`` by one and persist only that column.

    The increment is expressed with ``F('views') + 1`` so the database adds
    one to the stored value. Incrementing the in-memory value instead would
    lose counts when two requests load the same row and save concurrently.
    """
    self.views = models.F('views') + 1
    self.save(update_fields=['views'])
    # After saving, ``self.views`` still holds the F() expression; reload the
    # real integer so callers can keep using the instance.
    self.refresh_from_db(fields=['views'])

  def get_pre(self):
    """Return the product with the highest id below this one, or None."""
    return Product.objects.filter(id__lt=self.id).order_by('-id').first()

  def get_next(self):
    """Return the product with the lowest id above this one, or None."""
    return Product.objects.filter(id__gt=self.id).order_by('id').first()

  def __str__(self):
    return self.title

  class Meta:
    verbose_name = " "
    verbose_name_plural = verbose_name


class ProductAdvantage(models.Model):
  """One piece of text attached to a product (both fields are optional)."""
  content = models.TextField(' ', blank=True, null=True)
  product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

  def __str__(self):
    # ``content`` is nullable, and __str__ must return a str: returning None
    # raises TypeError wherever the object is rendered.
    return self.content or ''

  class Meta:
    verbose_name = " "
    verbose_name_plural = verbose_name


class ProductBody(models.Model):
  """One short line of body text (max 256 chars) attached to a product."""
  body = models.CharField(' ', max_length=256, blank=True, null=True)
  product = models.ForeignKey(Product, on_delete=models.CASCADE, blank=True, null=True)

  def __str__(self):
    # ``product`` is nullable: fall back to the body text (or an empty string)
    # instead of raising AttributeError on ``None.title``.
    if self.product_id is None:
      return self.body or ''
    return self.product.title

  class Meta:
    verbose_name = " "
    verbose_name_plural = verbose_name

2. 스크립트 작성


2.1 웹 소스 코드 함수 가져오기

def get_one_page(url, timeout=10):
  """Download ``url`` and return its body decoded as UTF-8.

  Args:
    url: address of the page to fetch.
    timeout: seconds passed to ``requests.get``; without a timeout a stalled
      server would block the script indefinitely.

  Returns:
    The response text when the status code is 200, otherwise ``None``
    (non-200 status or a request error).
  """
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
  try:
    res = requests.get(url=url, headers=headers, timeout=timeout)
  except requests.RequestException:
    # Connection problems, timeouts, malformed URLs, ...
    return None
  if res.status_code != 200:
    return None
  res.encoding = 'utf-8'
  return res.text
2.2 기본 페이지에서 모든 제품 분류(카테고리) 페이지 링크 가져오기

if __name__ == '__main__':
  # get_one_page() returns None on failure; stop here rather than hand that
  # to etree.HTML().
  content = get_one_page(url)
  if not content:
    raise SystemExit('failed to fetch ' + url)
  tree = etree.HTML(content)
  # href of every category link in the navigation block
  category_hrefs = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # Prefix the host to each href and print the resulting category URL.
  # A separate loop name keeps the start ``url`` from being overwritten.
  for category_href in category_hrefs:
    print('http://www.kexinjianji.com' + category_href)
2.3 제품 분류 페이지에서 모든 제품 링크 가져오기

if __name__ == '__main__':
  # get_one_page() returns None on failure; stop here rather than hand that
  # to etree.HTML().
  content = get_one_page(url)
  if not content:
    raise SystemExit('failed to fetch ' + url)
  tree = etree.HTML(content)
  # Category name taken from the page heading; print an empty name instead of
  # raising IndexError when the heading is not found.
  category_names = tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
  print(" :" + (category_names[0] if category_names else ''))
  # href of every product link listed on the category page
  product_hrefs = tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
  # Prefix the host to each href and print the resulting product URL.
  for product_href in product_hrefs:
    print('http://www.kexinjianji.com' + product_href)
  print("=====================================================")
이 두 가지를 결합하면 모든 제품 링크를 출력할 수 있습니다.

if __name__ == '__main__':
  # get_one_page() returns None on failure; stop here rather than hand that
  # to etree.HTML().
  content = get_one_page(url)
  if not content:
    raise SystemExit('failed to fetch ' + url)
  tree = etree.HTML(content)
  # href of every category link in the navigation block
  category_hrefs = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # Visit each category page. Distinct names are used for the two loops so
  # neither rebinds the start ``url`` nor the other loop's variable.
  for category_href in category_hrefs:
    category_url = 'http://www.kexinjianji.com' + category_href
    category_page = get_one_page(category_url)
    if not category_page:
      # Skip a category that cannot be downloaded instead of crashing.
      print('failed to fetch ' + category_url)
      continue
    category_tree = etree.HTML(category_page)
    # Category name taken from the page heading (empty when not found).
    category_names = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    print(" :" + (category_names[0] if category_names else ''))
    # href of every product link listed on the category page
    product_hrefs = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    for product_href in product_hrefs:
      print('http://www.kexinjianji.com' + product_href)
    print("=====================================================")
2.4 XPath로 제품 페이지의 내용 추출하기

if __name__ == '__main__':
  from urllib.parse import urljoin

  # get_one_page() returns None on failure; stop here rather than hand that
  # to etree.HTML().
  content = get_one_page(url)
  if not content:
    raise SystemExit('failed to fetch ' + url)
  tree = etree.HTML(content)
  # Product title, main image and the first <table> of the page
  title = tree.xpath('//*[@id="wrap"]//h1/text()')
  images = tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
  tables = tree.xpath('//table')
  # Each of these is indexed with [0] below; report a page that lacks one of
  # them instead of failing with a bare IndexError.
  if not (title and images and tables):
    raise SystemExit('unexpected page layout: ' + url)
  # urljoin() yields a single '/' between host and path whether or not the
  # src attribute starts with a slash.
  images_url = urljoin('http://www.kexinjianji.com/', images[0])
  # Text nodes found in the div[@class="w"] blocks
  xntd = tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
  # Serialize the table element back to an HTML string
  jscs_str = etree.tostring(tables[0], encoding='utf-8').decode('utf-8')
  # <p> elements of the description block
  cpnr = tree.xpath('//div[@class="describe"]/p')
  print(' :' + title[0])
  print(' :' + images_url)
  for td in xntd:
    print(' :' + td)
  print(' :' + jscs_str)
  for paragraph in cpnr:
    # string(.) concatenates all text inside the <p>, nested tags included
    print(' :' + paragraph.xpath('string(.)'))
  print('============================================')
세 가지를 결합하면 모든 제품 정보를 얻을 수 있습니다.

if __name__ == '__main__':
  from urllib.parse import urljoin

  BASE_URL = 'http://www.kexinjianji.com'

  # get_one_page() returns None on failure; stop here rather than hand that
  # to etree.HTML().
  content = get_one_page(url)
  if not content:
    raise SystemExit('failed to fetch ' + url)
  tree = etree.HTML(content)
  # href of every category link in the navigation block
  category_hrefs = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # Distinct names are used for the two loops so neither rebinds the start
  # ``url`` nor the other loop's variable.
  for category_href in category_hrefs:
    category_url = urljoin(BASE_URL, category_href)
    category_page = get_one_page(category_url)
    if not category_page:
      # Report and skip a category that cannot be downloaded.
      print(' url:' + category_url)
      continue
    category_tree = etree.HTML(category_page)
    # Category name taken from the page heading
    catgory = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    # href of every product link listed on the category page
    product_hrefs = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    for product_href in product_hrefs:
      product_url = urljoin(BASE_URL, product_href)
      # Per-product boundary: any failure (download, missing element, ...) is
      # reported together with the URL and the loop moves on.
      try:
        product_page = get_one_page(product_url)
        if not product_page:
          raise ValueError('page could not be fetched')
        product_tree = etree.HTML(product_page)
        # Product title and main image
        title = product_tree.xpath('//*[@id="wrap"]//h1/text()')
        images = product_tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
        images_url = urljoin(BASE_URL, images[0])
        # Text nodes found in the div[@class="w"] blocks
        xntd = product_tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
        # First <table> of the page, serialized back to an HTML string
        jscs = product_tree.xpath('//table')[0]
        jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
        # <p> elements of the description block
        cpnr = product_tree.xpath('//div[@class="describe"]/p')
        print(" :" + catgory[0])
        print(' :' + product_url)
        print(' :' + title[0])
        print(' :' + images_url)
        for td in xntd:
          print(' :' + td.strip())
        # print(' :' + jscs_str)
        for paragraph in cpnr:
          # string(.) concatenates all text inside the <p>, nested tags included
          print(' :' + paragraph.xpath('string(.)'))
        print('============================================')
      except Exception as e:
        print(e)
        print(' url:' + product_url)

3. django 모델에 저장


import requests
from lxml.html import etree
import os
import django
import uuid
from django.core.files.base import ContentFile

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "jiaobanzhan.settings")
django.setup()

from products.models import ProductBody, ProductsCategory, Product, ProductAdvantage

url = 'http://www.kexinjianji.com/product/hzshntjbz_1/'


def get_one_page(url, timeout=10):
  """Download ``url`` and return its body decoded as UTF-8.

  Args:
    url: address of the page to fetch.
    timeout: seconds passed to ``requests.get`` (default 10).

  Returns:
    The response text when the status code is 200, otherwise ``None``
    (non-200 status or a request error). Request errors are printed so the
    reason for a ``None`` result is visible in the script output.
  """
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
  try:
    res = requests.get(url=url, headers=headers, timeout=timeout)
  except requests.RequestException as exc:
    # Connection problems, timeouts, malformed URLs, ...
    print('request failed for {}: {}'.format(url, exc))
    return None
  if res.status_code != 200:
    return None
  res.encoding = 'utf-8'
  return res.text


if __name__ == '__main__':
  from urllib.parse import urljoin

  BASE_URL = 'http://www.kexinjianji.com'

  # get_one_page() returns None on failure; stop here rather than hand that
  # to etree.HTML().
  content = get_one_page(url)
  if not content:
    raise SystemExit('failed to fetch ' + url)
  tree = etree.HTML(content)
  # href of every category link in the navigation block
  category_hrefs = tree.xpath('//div[@class="fdh-01-nav"]/div/h3/a/@href')
  # Distinct names are used for the two loops so neither rebinds the start
  # ``url`` nor the other loop's variable.
  for category_href in category_hrefs:
    category_url = urljoin(BASE_URL, category_href)
    category_page = get_one_page(category_url)
    if not category_page:
      # Report and skip a category that cannot be downloaded.
      print(' url:' + category_url)
      continue
    category_tree = etree.HTML(category_page)
    # Category name taken from the page heading
    p_catgory = category_tree.xpath('//div[@class="cplb-3n-ts-03 b"]/h3/span/text()')
    # href of every product link listed on the category page
    product_hrefs = category_tree.xpath('//div[@class="cplb-3n-ts-03-list"]/dl/dt/a/@href')
    for product_href in product_hrefs:
      product_url = urljoin(BASE_URL, product_href)
      # Per-product boundary: any failure (download, missing element, database
      # error, ...) is reported together with the URL and the loop moves on.
      try:
        product_page = get_one_page(product_url)
        if not product_page:
          raise ValueError('page could not be fetched')
        product_tree = etree.HTML(product_page)
        # Product title and main image
        title = product_tree.xpath('//*[@id="wrap"]//h1/text()')
        images = product_tree.xpath('//div[@class="sol_tj_left"]/a/img/@src')
        images_url = urljoin(BASE_URL, images[0])
        # Text nodes found in the div[@class="w"] blocks
        xntd = product_tree.xpath('//div[@class="w"]//div/span/text()|//div[@class="w"]//div/text()')
        # First <table> of the page, serialized back to an HTML string
        jscs = product_tree.xpath('//table')[0]
        jscs_str = etree.tostring(jscs, encoding='utf-8').decode('utf-8')
        # <p> elements of the description block
        cpnr = product_tree.xpath('//div[@class="describe"]/p')

        # Fetch the category by name, creating it on first sight.
        products_catgory, _ = ProductsCategory.objects.get_or_create(name=p_catgory[0])
        print(products_catgory)

        # Download the image. The timeout prevents a stalled download from
        # blocking the run, and raise_for_status() prevents an HTTP error
        # page from being stored as the product image.
        image_response = requests.get(url=images_url, timeout=10)
        image_response.raise_for_status()
        ext = images_url.split('.')[-1]  # text after the last dot of the URL
        filename = '{}.{}'.format(uuid.uuid4().hex[:8], ext)  # random file name
        upload_image_file = ContentFile(image_response.content, name=filename)
        product = Product(title=title[0], jscs=jscs_str, image=upload_image_file, category=products_catgory)
        product.save()

        for td in xntd:
          # The text() query also returns the whitespace between tags; store
          # only non-blank, stripped text.
          text = td.strip()
          if text:
            ProductAdvantage.objects.create(content=text, product=product)
        for paragraph in cpnr:
          # string(.) concatenates all text inside the <p>, nested tags included
          ProductBody.objects.create(body=paragraph.xpath('string(.)'), product=product)
      except Exception as e:
        print(e)
        print(' url:' + product_url)
마지막으로, 오류가 발생한 URL은 수동으로 처리합니다. (해당 페이지에서는 기술 파라미터를 가져오지 못했는데, 기술 파라미터가 표가 아니라 이미지로 되어 있기 때문입니다.)

4. 요약


1. XPath로 태그 내용을 가져올 때, p 태그 안에 span 태그가 중첩되어 있는 경우입니다. 원본 HTML은 다음과 같습니다.

<div class="describe" style="position: relative;"> 
   <p><span>    :</span>1500mm</p> 
   <p><span>    :</span>4.5 mm</p> 
   <p><span> :</span>6 </p> 
   <p><span>    :</span>6000 kg</p>
</div>
xpath를 사용하여 p 태그 내용 가져오기
제가 얻고 싶은 효과는 다음과 같습니다.
판폭: 1500mm
두께: 4.5mm
재료 배출구: 6입
무게: 6000kg
다음 XPath를 사용하면 원하는 결과가 나오지 않고, span의 텍스트와 p의 텍스트가 따로따로 추출됩니다.

//div[@class="describe"]/p/span/text()|//div[@class="describe"]/p/text()
바이두에서 검색해 찾은 해결 방법은 xpath('string(.)')를 사용하는 것입니다.
1. 모든 p 태그 가져오기

cpnr = tree.xpath('//div[@class="describe"]/p')
2. **string(.)**을 사용하여 태그 안의 모든 텍스트 가져오기

cp = cp.xpath('string(.)')
모든 p 태그를 순회하면서 위 코드를 적용하면 됩니다.
이상으로 Python에서 XPath를 이용해 웹 데이터를 추출하고 Django 모델에 저장하는 방법을 소개했습니다. XPath로 웹 데이터를 추출해 Django 모델에 저장하는 것과 관련된 더 많은 내용은 이전 글을 검색하거나 아래의 관련 글을 참고하시기 바랍니다!

좋은 웹페이지 즐겨찾기