写在前面
做点好玩的小程序: 使用 Selenium 爬取百度图片. 主要思路是用 XPath 选择器定位"下一张"按钮实现翻页, 并读取图片的下载链接, 操作相当方便. 直接上代码:
# 用于设置强制等待
from time import sleep
import os
import requests
# 浏览器自动化引擎
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
def dirCreate(dirname):
    """Make sure the output folder ``dirname`` exists, creating it if needed.

    Prints a status line saying whether the folder was already there or is
    being created.

    Args:
        dirname: Path of the folder to create (intermediate folders are
            created as well).

    Raises:
        FileExistsError: If ``dirname`` exists but is not a directory.
    """
    # isdir (rather than exists) so a regular file with the same name is not
    # mistaken for a usable output folder.
    if os.path.isdir(dirname):
        print("文件夹:%s已存在, 开始爬取..." % dirname)
        return
    print("文件夹:%s不存在, 创建..." % dirname)
    # exist_ok avoids failing if the folder appears between the check above
    # and this call.
    os.makedirs(dirname, exist_ok=True)
def saveFig(figname, url2, dirpath=None):
    """Download the image at ``url2`` and write it to ``<folder>/<figname>``.

    Args:
        figname: File name to save the image under, e.g. ``"1.jpg"``.
        url2: URL of the image to download.
        dirpath: Folder to save into. When omitted, the module-level
            ``dirname`` set by the script entry point is used.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    target_dir = dirname if dirpath is None else dirpath
    # Download before opening the output file so a failed request does not
    # leave an empty or bogus image on disk.
    response = requests.get(url2, timeout=30)
    response.raise_for_status()
    with open(os.path.join(target_dir, figname), "wb") as img:
        img.write(response.content)
    print("图片保存成功!")
def saveUrl(txtname, url1, id1, dirpath=None):
    """Append one ``<id>\\t<url>`` record to the link list file.

    Args:
        txtname: Name of the text file that collects the links.
        url1: Image URL to record (must be a string).
        id1: Sequence number of the image; converted with ``str``.
        dirpath: Folder containing the text file. When omitted, the
            module-level ``dirname`` set by the script entry point is used.
    """
    target_dir = dirname if dirpath is None else dirpath
    record = "\t".join((str(id1), url1)) + "\n"
    # Append mode: each call adds one line, earlier records are kept.
    with open(os.path.join(target_dir, txtname), "a", encoding="utf-8") as f:
        f.write(record)
    print("链接保存成功!")
if __name__ == '__main__':
    # Detail page of the first image to crawl. The literal is stripped so the
    # newlines around it inside the triple quotes are not sent to the browser.
    url = '''
https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A6%99%E8%95%89&step_word=&hs=0&pn=2&spn=0&di=7060663421280190465&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=2&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs=3678626669%2C3982988379&os=1030740632%2C2370136271&simid=3547181550%2C510924326&adpicid=0&lpn=0&ln=1733&fr=&fmq=1649555264889_R&fm=detail&ic=0&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=&height=&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=https%3A%2F%2Fgimg2.baidu.com%2Fimage_search%2Fsrc%3Dhttp%3A%2F%2Fnimg.ws.126.net%2F%3Furl%3Dhttp%3A%2F%2Fdingyue.ws.126.net%2F2021%2F0421%2F653dc07ej00qrx4s2001rc000zk00npm.jpg%26thumbnail%3D650x2147483647%26quality%3D80%26type%3Djpg%26refer%3Dhttp%3A%2F%2Fnimg.ws.126.net%26app%3D2002%26size%3Df9999%2C10000%26q%3Da80%26n%3D0%26g%3D0n%26fmt%3Dauto%3Fsec%3D1652147269%26t%3D3b6f6402feeee94190976426e98f9cb2&fromurl=ippr_z2C%24qAzdH3FAzdH3F1y_z%26e3B8mn_z%26e3Bv54AzdH3Fw6ptvsjAzdH3FGbcdKJJdacddJUUb_z%26e3Bip4s&gsm=1&rpstart=0&rpnum=0&islist=&querylist=&nojc=undefined
'''.strip()
    # Output folder. saveFig/saveUrl read this module-level name by default,
    # so it must stay a global called `dirname`.
    dirname = "banana_figs"
    dirCreate(dirname)
    # Text file (inside the output folder) that collects the image links.
    txtname = "fig_url.txt"
    # Browser automation engine; change this to your own driver path.
    s = Service('/opt/homebrew/msedgedriver')
    driver = webdriver.Edge(service=s)
    try:
        # Implicit wait: element lookups poll for up to 15 s before giving up.
        driver.implicitly_wait(15)
        driver.get(url)
        total = 10
        for i in range(1, total + 1):
            data = driver.find_elements(
                by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[1]/div/img')
            print(i)
            # find_elements returns an empty list when nothing matches;
            # stop cleanly instead of failing on data[0].
            if not data:
                print("未找到图片元素, 停止爬取.")
                break
            figurl = data[0].get_attribute("src")
            if not figurl:
                print("图片没有src属性, 停止爬取.")
                break
            saveUrl(txtname, figurl, i)
            saveFig("%s.jpg" % str(i), figurl)
            if i < total:
                # Click "next image", then give the page a moment to swap
                # in the new picture before reading it.
                driver.find_element(
                    by=By.XPATH, value='/html/body/div[1]/div[2]/div/span[2]').click()
                sleep(1)
    finally:
        # quit() ends the whole WebDriver session (all windows plus the driver
        # process), and runs even if the crawl fails part-way.
        driver.quit()