问题描述
我想从这个网站抓取数据:“https://sephora.ae”,为此我写了下面这段代码:
import requests
from bs4 import BeautifulSoup
import json
# Fetch the product page at URL, extract its fields with BeautifulSoup, and
# append them as one ';'-separated row to out.csv.
# NOTE(review): this is the failing version quoted in the question. Every
# lookup uses find_all(), which returns a list, so indexing the result with a
# string key raises the TypeError shown in the traceback below, and calling
# .text on it cannot work either.
def sephora(URL):
# Open the output file in append mode.
# NOTE(review): File is never closed inside this function.
File = open("out.csv","a")
print("function start")
# Specify a browser-like User-Agent; any other user agent string can be
# substituted here.
print(URL)
# The outer parentheses are redundant: HEADERS is still a plain dict.
HEADERS = ({'User-Agent':
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/44.0.2403.157 Safari/537.36','Accept-Language': 'en-US,en;q=0.5'})
# Make the HTTP request.
webpage = requests.get(URL,headers=HEADERS)
# Create the soup object containing the parsed page.
soup = BeautifulSoup(webpage.content,'html.parser')
print("a")
# Brand name. find_all() returns a list, so the list itself (not its text)
# is what ends up in the output row.
try:
brand = soup.find_all('span',{"class": "brand-name"})
except AttributeError:
brand = 'NA'
# Product name. This is the line reported in the traceback: a list is
# indexed with "content", which raises TypeError, and TypeError is not
# caught by the AttributeError handler.
try:
prname = soup.find_all('Meta',{"itemprop": "name"})["content"].text
except AttributeError:
prname = 'NA'
# Price (same list-indexing problem as the product name).
try:
price = soup.find_all('Meta',{"itemprop": "price"})["content"].text
except AttributeError:
price = 'NA'
# Price currency (same list-indexing problem).
try:
price_cur = soup.find_all(
'Meta',{"itemprop": "priceCurrency"})["content"].text
except AttributeError:
price_cur = 'NA'
# Item code / SKU (same list-indexing problem).
try:
item_code = soup.find_all('Meta',{"itemprop": "sku"})["content"].text
except AttributeError:
item_code = 'NA'
# Variant URL (same list-indexing problem).
try:
urll = soup.find_all('Meta',{"itemprop": "url"})["content"].text
except AttributeError:
urll = 'NA'
# Category. A list has no .text attribute, so this always takes the
# AttributeError branch and yields 'NA'.
try:
category = soup.find_all('span',{"class": "product-name"}).text
except AttributeError:
category = 'NA'
# Size, read from the variation image's alt attribute (same list-indexing
# problem).
try:
size = soup.find_all('img',{"class": "variation-image"})["alt"].text
except AttributeError:
size = 'NA'
# Image data: the data-lgimg attribute is parsed as JSON with the keys
# url, title and alt. There is no try/except around this section.
# NOTE(review): the second dict is passed as a separate positional argument,
# not merged into the attribute filter - confirm against the bs4 signature.
image_tags = soup.find_all("a",{"class": "variation-display-name"},{"rel": "nofollow"})
imagess = image_tags["data-lgimg"]
res = json.loads(imagess)
img_src = res['url']
img_title = res['title']
img_alt = res['alt']
# Description. The class name here is spelled "product-description-Box".
try:
desc_div = soup.find_all('div',{"class": "product-description-Box"})
except AttributeError:
desc_div = 'NA'
# print(img_src)
print(img_alt)
# print(Item_code)
# print(Size)
# Write the row: 12 fields separated by ';', terminated by a newline.
# Field values are written as-is, with no quoting or escaping.
File.write(f"{URL};")
File.write(f"{brand};")
File.write(f"{prname};")
File.write(f"{category};")
File.write(f"{urll};")
File.write(f"{desc_div};")
# File.write(f"{ingredients};")
File.write(f"{price};")
File.write(f"{price_cur};")
File.write(f"{img_src};")
File.write(f"{img_title};")
File.write(f"{item_code};")
File.write(f"{size}\n")
# Script entry point: write the header row to out.csv, then call sephora()
# once for every line of url.txt.
if __name__ == '__main__':
# Open the URL list.
print("start")
file = open("url.txt","r")
# NOTE(review): the header names 11 columns, but sephora() writes 12 fields
# per row (the category field has no header).
header = "URL;BRAND;NAME;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
# Mode "w" truncates out.csv so each run starts with just the header.
File = open("out.csv","w")
File.write(f"{header}\n")
File.close()
# readlines() keeps the trailing newline on each entry, so it is passed to
# sephora() as part of the URL.
URLs = file.readlines()
for links in URLs:
sephora(links)
# File was already closed above; `file` (url.txt) is never closed.
File.close()
但是,当我在 repl.it 上运行时,out.csv 中只有表头一行,没有其他内容;而当我在 cmd 中运行时,会出现下面这个错误:
C:\Users\Admin\Desktop\sephora>python main.py
Traceback (most recent call last):
File "C:\Users\Admin\Desktop\sephora\main.py",line 137,in <module>
sephora(links)
File "C:\Users\Admin\Desktop\sephora\main.py",line 33,in sephora
prname = soup.find_all('Meta',{"itemprop": "name"})["content"]
TypeError: list indices must be integers or slices,not str
请帮我解决这个问题
我使用这个 URL 开始
https://www.sephora.ae/en/p/color-lip-last-lipstick-P1074023.html
谢谢
解决方法
首先,HEADERS 外层的圆括号是多余的(加了括号它仍然是字典,并不是元组),可以直接写成字典字面量:
HEADERS = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/44.0.2403.157 Safari/537.36','Accept-Language': 'en-US,en;q=0.5'}
其次,soup.find_all('meta',{"itemprop": "name"}) 返回的是元素列表,不能直接用字符串作为下标。
你需要先用整数下标取出列表中的元素,再读取它的属性或文本。把:
prname = soup.find_all('meta',{"itemprop": "name"})["content"].text
改为:
prname = soup.find_all('meta',{"itemprop": "name"})[0]["content"]
把:
category = soup.find_all('span',{"class": "product-name"}).text
改为:
category = soup.find_all('span',{"class": "product-name"})[0].text
把:
size = soup.find_all('img',{"class": "variation-image"})["alt"].text
改为:
size = soup.find_all('img',{"class": "variation-image"})[0]["alt"]
其余字段依此类推。
文档:https://beautiful-soup-4.readthedocs.io/en/latest/#searching-the-tree
您似乎对 BeautifulSoup 的一些概念/方法感到困惑,特别是抓取属性与抓取文本/内容的区别,以及 .find_all() 与 .find() 的区别。.find_all() 会返回所有匹配元素的列表,因此,如果您想要该列表中的某个特定项目,就需要使用索引;.find() 只返回它找到的第一个具有指定标签和属性的元素。
试试这个:
import requests
from bs4 import BeautifulSoup
import json
_NA = 'NA'


def _tag_text(soup, name, attrs):
    """Return the stripped text of the first matching tag, or 'NA' if none."""
    tag = soup.find(name, attrs)
    return tag.text.strip() if tag is not None else _NA


def _tag_attr(soup, name, attrs, attr):
    """Return attribute *attr* of the first matching tag, or 'NA' if absent."""
    tag = soup.find(name, attrs)
    if tag is None:
        return _NA
    return tag.get(attr, _NA)


def sephora(URL):
    """Scrape one Sephora product page and append it as a row to ``out.csv``.

    Args:
        URL: Address of the product page. Surrounding whitespace (such as
            the newline kept by ``readlines()``) is stripped; a blank value
            is skipped without writing a row.

    The row is ``;``-delimited and holds, in order: URL, brand, name,
    category, variant URL, description, price, currency, image URL, image
    title, item code and size. Any field that cannot be found on the page
    is written as ``'NA'``. Errors raised by ``requests.get`` are not
    caught here.
    """
    import csv  # local import so this function does not depend on file-level edits

    URL = URL.strip()
    print("function start")
    print(URL)
    if not URL:
        return

    # Browser-like headers; any other user agent string can be substituted.
    HEADERS = {
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'),
        'Accept-Language': 'en-US,en;q=0.5',
    }
    # A timeout keeps one unresponsive page from hanging the whole run.
    webpage = requests.get(URL, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(webpage.content, 'html.parser')
    print("a")

    brand = _tag_text(soup, 'span', {"class": "brand-name"})
    category = _tag_text(soup, 'span', {"class": "product-name"})
    desc_div = _tag_text(soup, 'div', {"class": "product-description-box"})

    # For <meta> tags the value lives in the "content" attribute, not the text.
    prname = _tag_attr(soup, 'meta', {"itemprop": "name"}, "content")
    price = _tag_attr(soup, 'meta', {"itemprop": "price"}, "content")
    price_cur = _tag_attr(soup, 'meta', {"itemprop": "priceCurrency"}, "content")
    item_code = _tag_attr(soup, 'meta', {"itemprop": "sku"}, "content")
    urll = _tag_attr(soup, 'meta', {"itemprop": "url"}, "content")
    size = _tag_attr(soup, 'img', {"class": "variation-image"}, "alt")

    # Image details are stored as a JSON object in the data-lgimg attribute.
    # Only the class is used to locate the link, which is how the lookup has
    # always behaved in practice.
    img_src = img_title = img_alt = _NA
    raw_image = _tag_attr(
        soup, "a", {"class": "variation-display-name"}, "data-lgimg")
    if raw_image != _NA:
        try:
            image = json.loads(raw_image)
        except ValueError:
            image = None
        if isinstance(image, dict):
            img_src = image.get('url', _NA)
            img_title = image.get('title', _NA)
            img_alt = image.get('alt', _NA)
    print(img_alt)

    row = [URL, brand, prname, category, urll, desc_div, price, price_cur,
           img_src, img_title, item_code, size]
    # csv.writer quotes any field containing ';', quotes or newlines, so a
    # multi-line description cannot break the row apart.
    with open("out.csv", "a", newline="", encoding="utf-8") as out_file:
        csv.writer(out_file, delimiter=";", lineterminator="\n").writerow(row)
if __name__ == '__main__':
    # Script entry point: start out.csv afresh with a header row, then scrape
    # every URL listed (one per line) in url.txt.
    print("start")

    # Read the URL list first so a missing url.txt fails before out.csv is
    # truncated. Blank lines are skipped and the trailing newline that each
    # line carries is removed so it never becomes part of the URL.
    with open("url.txt", "r") as url_file:
        urls = [line.strip() for line in url_file if line.strip()]

    # One column name per field written by sephora() (12 in total).
    header = "URL;BRAND;NAME;CATEGORY;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
    with open("out.csv", "w", encoding="utf-8") as out_file:
        out_file.write(f"{header}\n")

    for link in urls:
        sephora(link)