Python多线程Threading爬取图片,保存本地

注意:之前使用 openpyxl 库保存数据到 Excel 文件时写入失败,因此改用 xlsxwriter。

import os
import requests
import re
from openpyxl import load_workbook
import xlsxwriter
from multiprocessing.dummy import Pool as ThreadPool

def spider(url):
    """Download the first product image found on a search-result page.

    The SKU is the run of digits following ``q=`` in *url*. When the page
    contains a product image, it is saved as ``<cwd>/images/<sku>.jpg``;
    otherwise the SKU is appended to the module-level ``No_images`` list.

    Relies on the module-level names ``cwd`` and ``No_images``, which the
    ``__main__`` section assigns before the worker pool starts.

    Args:
        url: Search URL whose query string contains ``q=<sku>``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    # Extract the SKU from the trailing part of the link. Without a numeric
    # SKU there is nothing to name the image after (and nothing to record),
    # so bail out instead of failing later with an IndexError.
    sku_matches = re.findall(r'q=(\d+)', url, re.S)
    if not sku_matches:
        return
    sku = sku_matches[0]

    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params``, which would put the dict in the query string.
    html = requests.get(url, headers=headers, timeout=None)
    pic_url = re.findall('class="product-image">.*?<img src="(.*?)"  height="', html.text, re.S)
    if not pic_url:
        print('没有找到' + sku + '产品')
        No_images.append(sku)
        return

    print('正在下载' + sku + '图片,图片地址:' + pic_url[0])
    pic = requests.get(pic_url[0])
    # Build the path with os.path so it is not tied to the Windows separator,
    # and make sure the target directory exists before writing.
    image_dir = os.path.join(cwd, 'images')
    os.makedirs(image_dir, exist_ok=True)
    image_path = os.path.join(image_dir, sku + '.jpg')
    with open(image_path, 'wb') as file:
        file.write(pic.content)
#之前用过openpyxl创建新的Excel文件,但是写入不了,之后换用xlsxwriter保存数据到Excel
def save_excel(sku):
    """Write the SKUs that had no image to ``No_images.xlsx`` under ``cwd``.

    The first row of column A holds the header ``No_images_sku``; each SKU
    follows on its own row. Relies on the module-level name ``cwd``.

    Args:
        sku: Sequence of SKU values to record.
    """
    print(sku)
    workbook_path = os.path.join(cwd, 'No_images.xlsx')
    # The context manager closes (and thereby saves) the workbook even if a
    # write raises part-way through.
    with xlsxwriter.Workbook(workbook_path) as workbook:
        sheet = workbook.add_worksheet()
        sheet.write(0, 0, 'No_images_sku')
        # Row 0 is the header, so data rows start at 1.
        for row, value in enumerate(sku, start=1):
            sheet.write(row, 0, value)
    print('保存没有图片的sku成功!')

if __name__ == '__main__':
    # cwd and No_images are deliberately module-level: spider() and
    # save_excel() read them as globals.
    cwd = os.getcwd()
    path = os.path.join(cwd, '最近12个月没有销量产品(201711).xlsx')
    wb = load_workbook(path)
    ws = wb.worksheets[0]
    urls = []
    No_images = []
    # Read the SKU from column 2 of every row and build one search URL each.
    for i in range(1, ws.max_row + 1):
        sku = ws.cell(i, 2).value
        if sku is not None:
            print('正在爬取第' + str(i) + '个sku图片')
            url = 'http://www.fulchic.com/catalogsearch/result/?q=' + str(sku)
            urls.append(url)
    # multiprocessing.dummy provides a thread-backed pool, so this starts
    # 50 worker threads (not processes) for the I/O-bound downloads.
    pool = ThreadPool(50)
    try:
        # spider is the crawl function; urls is the list of links to fetch.
        pool.map(spider, urls)
    finally:
        # Always release the worker threads, even if a download raised.
        pool.close()
        pool.join()
    save_excel(No_images)

相关推荐