# 因为百度图片是动态加载的,所以用之前直接访问的方式爬取数据是行不通的,这就需要引入 selenium 自动化。
import os
import posixpath
import time
import traceback
from urllib import request
from urllib.parse import urlsplit

from lxml import etree
from selenium import webdriver
def get_page(path, url, *, scroll_times=3, scroll_pause=3, settle_time=10):
    """Open *url* in Chrome, scroll down to load more results, return the HTML.

    The browser is stored in the module-level global ``driver`` so the caller
    can close it with ``driver.quit()`` when it is done; this function does
    not close the browser itself. The page source is also published as the
    module-level global ``html``.

    Args:
        path: Location of the chromedriver executable.
        url: Page to open.
        scroll_times: How many times to scroll to the bottom of the page.
        scroll_pause: Seconds to wait before each scroll.
        settle_time: Seconds to wait after the last scroll before the page
            source is read.

    Returns:
        The page source reported by the browser after scrolling.

    Raises:
        ValueError: If ``scroll_times`` or one of the delays is negative.
    """
    # Both names are module-level on purpose: the caller quits via `driver`.
    global driver, html

    # Validate before launching anything, so a bad argument cannot leave a
    # browser process running.
    if scroll_times < 0 or scroll_pause < 0 or settle_time < 0:
        raise ValueError(
            "scroll_times, scroll_pause and settle_time must be non-negative"
        )

    # NOTE(review): newer Selenium releases expect the driver path to be passed
    # through a Service object - confirm against the installed version.
    driver = webdriver.Chrome(path)
    driver.get(url)

    for _ in range(scroll_times):
        time.sleep(scroll_pause)
        # Scroll to the bottom of the page so that more results get loaded.
        driver.execute_script("scrollTo(0,document.body.scrollHeight)")

    # Give the content requested by the last scroll time to arrive.
    time.sleep(settle_time)

    html = driver.page_source
    return html
def _image_name(link, fallback):
    """Return the local ``.jpg`` file name to use for the image URL *link*."""
    # Only the path component names the file; the query string and fragment
    # are dropped, and an extension of any length is stripped.
    url_path = urlsplit(link).path
    stem = posixpath.splitext(posixpath.basename(url_path))[0]
    return (stem or fallback) + ".jpg"


def parse(html, save_dir="images"):
    """Download every image referenced by the result list in *html*.

    Image URLs are read from the ``data-objurl`` attribute of the ``li``
    children of any ``ul`` whose class starts with ``imglist``. Each image is
    saved into *save_dir* under a ``.jpg`` name derived from its URL.

    Args:
        html: Page source to scan. Empty or ``None`` input is a no-op.
        save_dir: Directory the images are written to; created if missing.

    Returns:
        None.
    """
    if not html:
        return

    tree = etree.HTML(html)
    link_list = tree.xpath('//ul[starts-with(@class,"imglist")]/li/@data-objurl')

    # urlretrieve does not create the target directory itself.
    os.makedirs(save_dir, exist_ok=True)

    for index, link in enumerate(link_list):
        name = _image_name(link, "image_%d" % index)
        print("正在下载:%s"%name)
        try:
            request.urlretrieve(link, os.path.join(save_dir, name))
        except (OSError, ValueError) as exc:
            # URLError/HTTPError are OSError subclasses; ValueError covers
            # malformed URLs. One bad link must not abort the remaining ones.
            print("下载失败:%s (%s)" % (link, exc))
if __name__ == '__main__':
    # Location of the chromedriver executable.
    path = r"C:\谷歌驱动\chromedriver.exe"
    # Search-result page to scrape.
    url = "http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=%E7%BE%8E%E5%A5%B3"

    # get_page() rebinds this module-level name to the running browser; it is
    # pre-set to None so the cleanup below is safe even if startup fails.
    driver = None
    try:
        html = get_page(path, url)
        try:
            parse(html)
        except Exception:
            # Parsing/downloading stays best-effort, but the failure is
            # reported instead of being silently discarded.
            traceback.print_exc()
    finally:
        # Always close the browser, even when loading the page failed.
        if driver is not None:
            driver.quit()