一个简单的百度图片爬虫(使用selenium)

 
Category: Python

写在前面

做点好玩的小程序: 使用 Selenium 爬取百度图片. 主要思路是用 XPath 选择器定位图片元素读取下载链接, 再点击"下一张"按钮实现翻页, 最后用 requests 把图片保存到本地, 操作相当方便. 直接上代码:

# 用于设置强制等待
from time import sleep
import os
import requests
# 浏览器自动化引擎
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service


def dirCreate(dirname):
    """Make sure the output directory exists and report which case applied.

    Args:
        dirname: Path of the directory to check. When it is missing it is
            created with ``os.makedirs`` (intermediate directories included).
    """
    already_there = os.path.exists(dirname)
    if already_there:
        print("文件夹:%s已存在, 开始爬取..." % dirname)
        return

    # Announce first, then create, so the message is shown even if the
    # creation itself fails.
    print("文件夹:%s不存在, 创建..." % dirname)
    os.makedirs(dirname)


def saveFig(figname, url2):
    """Download one image and write it into the output directory.

    The destination directory is taken from the module-level ``dirname``
    variable, not from a parameter. The image is downloaded completely
    before the destination file is opened, so a failed request does not
    leave an empty or truncated file behind.

    Args:
        figname: File name (for example ``"1.jpg"``) to create inside
            ``dirname``.
        url2: URL the image bytes are fetched from.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    # Without a timeout a single stalled server would block the crawl forever.
    response = requests.get(url2, timeout=30)
    # Refuse to store an HTTP error page under an image file name.
    response.raise_for_status()

    with open(dirname + "/%s" % figname, "wb") as img:
        img.write(response.content)
    print("图片保存成功!")


def saveUrl(txtname, url1, id1):
    """Append one tab-separated "id, url" record to the link list file.

    The file lives inside the directory named by the module-level
    ``dirname`` variable and is opened in append mode with UTF-8 encoding.

    Args:
        txtname: Name of the text file inside ``dirname``.
        url1: Image URL to record; must be a string.
        id1: Sequence number of the image; converted with ``str``.
    """
    # str.join rejects non-string items, so a non-string URL still raises
    # TypeError before the file is touched.
    record = "".join([str(id1), "\t", url1, "\n"])
    out_path = dirname + ("/%s" % txtname)

    with open(out_path, "a", encoding="utf-8") as link_file:
        link_file.write(record)
        print("链接保存成功!")


if __name__ == '__main__':

    # Detail-page URL the crawl starts from. The literal is written on its
    # own line, so strip the surrounding newline and indentation before use.
    url = '''
    https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A6%99%E8%95%89&step_word=&hs=0&pn=2&spn=0&di=7060663421280190465&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=2&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs=3678626669%2C3982988379&os=1030740632%2C2370136271&simid=3547181550%2C510924326&adpicid=0&lpn=0&ln=1733&fr=&fmq=1649555264889_R&fm=detail&ic=0&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=&height=&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=https%3A%2F%2Fgimg2.baidu.com%2Fimage_search%2Fsrc%3Dhttp%3A%2F%2Fnimg.ws.126.net%2F%3Furl%3Dhttp%3A%2F%2Fdingyue.ws.126.net%2F2021%2F0421%2F653dc07ej00qrx4s2001rc000zk00npm.jpg%26thumbnail%3D650x2147483647%26quality%3D80%26type%3Djpg%26refer%3Dhttp%3A%2F%2Fnimg.ws.126.net%26app%3D2002%26size%3Df9999%2C10000%26q%3Da80%26n%3D0%26g%3D0n%26fmt%3Dauto%3Fsec%3D1652147269%26t%3D3b6f6402feeee94190976426e98f9cb2&fromurl=ippr_z2C%24qAzdH3FAzdH3F1y_z%26e3B8mn_z%26e3Bv54AzdH3Fw6ptvsjAzdH3FGbcdKJJdacddJUUb_z%26e3Bip4s&gsm=1&rpstart=0&rpnum=0&islist=&querylist=&nojc=undefined
    '''.strip()
    # Output directory. saveFig and saveUrl read this module-level name.
    dirname = "banana_figs"
    dirCreate(dirname)
    # Name of the text file that collects the image links.
    txtname = "fig_url.txt"
    # How many images to walk through.
    num_images = 10

    # Start the browser; change the path to your own browser driver.
    s = Service('/opt/homebrew/msedgedriver')
    driver = webdriver.Edge(service=s)
    try:
        driver.get(url)

        # Implicit wait used by the element lookups below.
        driver.implicitly_wait(15)
        for i in range(1, num_images + 1):
            data = driver.find_elements(
                by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[1]/div/img')
            print(i)
            if not data:
                # The page layout no longer matches the XPath; nothing to do.
                print("未找到图片元素, 停止爬取.")
                break

            figurl = data[0].get_attribute("src")
            if figurl:
                saveUrl(txtname, figurl, i)
                try:
                    saveFig("%s.jpg" % str(i), figurl)
                except requests.RequestException as err:
                    # One failed download should not abort the whole crawl.
                    print("图片下载失败: %s" % err)
            else:
                print("图片缺少下载链接, 跳过.")

            if i < num_images:
                # Click "next image" and give the page a moment to update.
                driver.find_element(
                    by=By.XPATH, value='/html/body/div[1]/div[2]/div/span[2]').click()
                sleep(1)
    finally:
        # quit() ends the whole browser session and the driver process,
        # even when the loop above raised.
        driver.quit()