在 bs4 中抓取数据的问题

问题描述

我想从这个网站抓取数据：“https://sephora.ae”，为此我写了下面这段代码：

import requests
from bs4 import BeautifulSoup
import json



def sephora(URL):
    """Fetch the page at URL, extract product fields and append them to out.csv.

    The page is downloaded with requests, parsed with BeautifulSoup, and the
    extracted values are written as one ';'-separated line.

    NOTE(review): this is the listing that produces the traceback shown below
    it (TypeError: list indices must be integers or slices, not str); the
    comments mark the lines involved.
    """
    # opening our output file in append mode
    # NOTE(review): this handle is never closed inside the function.
    File = open("out.csv","a")
    print("function start")
    # specifying user agent,You can use other user agents
    # available on the internet
    print(URL)
    # The surrounding parentheses do not change the type: this is a plain dict.
    HEADERS = ({'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64)                  AppleWebKit/537.36 (KHTML,like Gecko)                     Chrome/44.0.2403.157 Safari/537.36','Accept-Language': 'en-US,en;q=0.5'}) 
 
    
    # Making the HTTP Request 
    webpage = requests.get(URL,headers=HEADERS)  
    
    # Creating the Soup Object containing all data 
    soup = BeautifulSoup(webpage.content,'html.parser')
    print("a")



    #brand name
    # find_all() returns a list, so `brand` is a list of tags and is later
    # written to the CSV as that list's string form.
    try:
        brand = soup.find_all('span',{"class": "brand-name"})
    except AttributeError:
        brand = 'NA'

    #Product Name
    # Indexing the list returned by find_all() with a string raises TypeError
    # (see the traceback below), which `except AttributeError` does not catch.
    # NOTE(review): the tag name is capitalised here while the answer searches
    # for 'meta' -- presumably the parser lower-cases tag names; confirm.
    try:
        prname = soup.find_all('Meta',{"itemprop": "name"})["content"].text
    except AttributeError:
        prname = 'NA'

    #price
    try:
        price = soup.find_all('Meta',{"itemprop": "price"})["content"].text
    except AttributeError:
        price = 'NA'

    #price currency
    try:
        price_cur = soup.find_all(
            'Meta',{"itemprop": "priceCurrency"})["content"].text
    except AttributeError:
        price_cur = 'NA'

    #item code
    try:
        item_code = soup.find_all('Meta',{"itemprop": "sku"})["content"].text
    except AttributeError:
        item_code = 'NA'

    #variant url
    try:
        urll = soup.find_all('Meta',{"itemprop": "url"})["content"].text
    except AttributeError:
        urll = 'NA'

    #category
    # NOTE(review): `.text` is accessed on the find_all() result itself, not on
    # an element of it -- presumably this always ends up as 'NA'; confirm.
    try:
        category = soup.find_all('span',{"class": "product-name"}).text
    except AttributeError:
        category = 'NA'

    #size
    try:
        size = soup.find_all('img',{"class": "variation-image"})["alt"].text
    except AttributeError:
        size = 'NA'

    #image_src
    # This section has no try/except, so any failure here aborts the function.
    # NOTE(review): the third positional dict is presumably not treated as an
    # attribute filter -- check the find_all() signature.
    image_tags = soup.find_all("a",{"class": "variation-display-name"},{"rel": "nofollow"})
    imagess = image_tags["data-lgimg"]
    # The attribute value is decoded as JSON and read via its 'url', 'title'
    # and 'alt' keys.
    res = json.loads(imagess)
    img_src = res['url']
    img_title = res['title']
    img_alt = res['alt']

    #description
    try:
        desc_div = soup.find_all('div',{"class": "product-description-Box"})
    except AttributeError:
        desc_div = 'NA'
    
    # print(img_src)
    print(img_alt)
    # print(Item_code)
    # print(Size)
    # One output line of 12 ';'-separated fields. URL is written exactly as
    # received, including any trailing newline it carries.
    File.write(f"{URL};")
    File.write(f"{brand};")
    File.write(f"{prname};")
    File.write(f"{category};")
    File.write(f"{urll};")
    File.write(f"{desc_div};")
    # File.write(f"{ingredients};")
    File.write(f"{price};")
    File.write(f"{price_cur};")
    File.write(f"{img_src};")
    File.write(f"{img_title};")
    File.write(f"{item_code};")
    File.write(f"{size}\n")


# Script entry point: write the CSV header to out.csv, then call sephora()
# once for every line of url.txt.
if __name__ == '__main__':
    # opening our url file to access URLs
    print("start")
    # NOTE(review): this handle is never closed.
    file = open("url.txt","r")
    # NOTE(review): 11 column names here, but sephora() writes 12 fields per
    # row (there is no name for the category field).
    header = "URL;BRAND;NAME;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
    # Mode "w" truncates out.csv, so every run starts from a fresh file.
    File = open("out.csv","w")
    File.write(f"{header}\n")
    File.close()

    # readlines() keeps the trailing newline on each line, so each URL is
    # passed to sephora() with it.
    URLs = file.readlines()
   
    for links in URLs:
        sephora(links)
    # Second close() of the handle already closed above.
    File.close()

但是在 out.csv 中，当我使用 repl.it 运行时，只有标题出现，没有其他内容。当我用 cmd 运行时，就会出现这个错误

C:\Users\Admin\Desktop\sephora>python main.py
Traceback (most recent call last):
  File "C:\Users\Admin\Desktop\sephora\main.py",line 137,in <module>
    sephora(links)
  File "C:\Users\Admin\Desktop\sephora\main.py",line 33,in sephora
    prname = soup.find_all('Meta',{"itemprop": "name"})["content"]
TypeError: list indices must be integers or slices,not str

请帮我解决这个问题

我使用这个 URL 开始

https://www.sephora.ae/en/p/color-lip-last-lipstick-P1074023.html

谢谢

解决方法

首先，HEADERS 外层的圆括号是多余的（`({...})` 仍然是字典，并不是元组），建议直接写成字典：

HEADERS = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/44.0.2403.157 Safari/537.36','Accept-Language': 'en-US,en;q=0.5'}

其次，soup.find_all('meta',{"itemprop": "name"}) 返回的是元素列表，不能直接用字符串作为下标。

你需要改变：

prname = soup.find_all('meta',{"itemprop": "name"})["content"].text

改为 prname = soup.find_all('meta',{"itemprop": "name"})[0]["content"]（meta 标签没有文本内容，值保存在 content 属性里）

category = soup.find_all('span',{"class": "product-name"}).text

改为 category = soup.find_all('span',{"class": "product-name"})[0].text

size = soup.find_all('img',{"class": "variation-image"})["alt"].text

改为 size = soup.find_all('img',{"class": "variation-image"})[0]["alt"]

其余字段依此类推。

文档：https://beautiful-soup-4.readthedocs.io/en/latest/#searching-the-tree

您似乎对 BeautifulSoup 的一些概念/方法感到困惑。特别是抓取属性与抓取文本/内容。还有对 .find_all() 与 .find() 的区别的理解。 .find_all() 将返回所有这些元素的列表。因此，如果您想要该列表中的特定项目，则需要使用索引。 .find() 将只返回它找到的第一个具有您要查找的特定标签和属性的元素。

试试这个：

import requests
from bs4 import BeautifulSoup
import json



def _attr_or_na(tag, attr):
    """Return ``tag[attr]``, or ``'NA'`` when the tag or the attribute is missing."""
    # soup.find() returns None when nothing matches; subscripting None raises
    # TypeError (not AttributeError), and a missing attribute raises KeyError.
    try:
        return tag[attr]
    except (TypeError, KeyError):
        return 'NA'


def _text_or_na(tag):
    """Return the stripped text of ``tag``, or ``'NA'`` when ``tag`` is None."""
    return tag.text.strip() if tag is not None else 'NA'


def sephora(URL):
    """Scrape one Sephora product page and append it as a row to out.csv.

    Args:
        URL: Address of the product page. Surrounding whitespace (for example
            the trailing newline left by ``readlines()``) is stripped.

    Every field that cannot be found on the page is written as ``'NA'``
    instead of aborting the run. The row has 12 ';'-separated fields:
    URL, brand, name, category, variant URL, description, price, currency,
    image URL, image title, item code, size.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Local import so the block does not depend on the file's import section.
    import csv

    # Lines read from url.txt end with '\n'; left in place it would end up
    # inside the first CSV field and break the row in two.
    URL = URL.strip()
    print("function start")
    print(URL)

    # specifying user agent, You can use other user agents
    # available on the internet
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML,like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    # Making the HTTP Request (with a timeout so a stalled server cannot hang
    # the whole run)
    webpage = requests.get(URL, headers=HEADERS, timeout=30)

    # Creating the Soup Object containing all data
    soup = BeautifulSoup(webpage.content, 'html.parser')
    print("a")

    # Element text
    brand = _text_or_na(soup.find('span', {"class": "brand-name"}))
    category = _text_or_na(soup.find('span', {"class": "product-name"}))
    desc_div = _text_or_na(
        soup.find('div', {"class": "product-description-box"}))

    # <meta itemprop="..." content="..."> values
    prname = _attr_or_na(soup.find('meta', {"itemprop": "name"}), "content")
    price = _attr_or_na(soup.find('meta', {"itemprop": "price"}), "content")
    price_cur = _attr_or_na(
        soup.find('meta', {"itemprop": "priceCurrency"}), "content")
    item_code = _attr_or_na(soup.find('meta', {"itemprop": "sku"}), "content")
    urll = _attr_or_na(soup.find('meta', {"itemprop": "url"}), "content")

    # size
    size = _attr_or_na(soup.find('img', {"class": "variation-image"}), "alt")

    # image_src: the data-lgimg attribute holds a JSON object.
    # The previous call passed {"rel": "nofollow"} as a third positional
    # argument, which find() takes as `recursive`, not as an attribute filter;
    # it never filtered anything, so it is dropped here.
    image_tag = soup.find("a", {"class": "variation-display-name"})
    try:
        res = json.loads(image_tag["data-lgimg"])
    except (TypeError, KeyError, ValueError):
        # No such tag, no such attribute, or the attribute is not valid JSON.
        res = {}
    if not isinstance(res, dict):
        res = {}
    img_src = res.get('url', 'NA')
    img_title = res.get('title', 'NA')
    img_alt = res.get('alt', 'NA')

    print(img_alt)

    row = [
        URL, brand, prname, category, urll, desc_div,
        price, price_cur, img_src, img_title, item_code, size,
    ]
    # csv.writer quotes fields that contain ';', quotes or newlines (the
    # description often does), and the with-block guarantees the row is
    # flushed and the handle closed even if writing fails.
    with open("out.csv", "a", newline="", encoding="utf-8") as out_file:
        csv.writer(out_file, delimiter=";", lineterminator="\n").writerow(row)


# Script entry point: read the product URLs from url.txt, start a fresh
# out.csv containing only the header line, then scrape every URL in order.
if __name__ == '__main__':
    print("start")

    # Read the URLs first so a missing url.txt fails before out.csv is
    # truncated. Each line is stripped of its trailing newline and blank
    # lines are skipped, so sephora() only ever receives a usable URL.
    with open("url.txt", "r") as url_file:
        URLs = [line.strip() for line in url_file if line.strip()]

    # sephora() writes 12 fields per row (category comes right after the
    # product name); the header previously had only 11 names, which shifted
    # every later column by one.
    header = "URL;BRAND;NAME;CATEGORY;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
    # Mode "w" truncates the file; the with-block closes it before sephora()
    # reopens it in append mode.
    with open("out.csv", "w") as out_file:
        out_file.write(f"{header}\n")

    for links in URLs:
        sephora(links)

beautifulsoup beautifulsoup cmd cmd python python-requests repl.it