Python 多线程(threading)爬取图片并保存到本地
注意:之前尝试用 openpyxl 库把数据保存到新建的 Excel 文件时写入失败,因此改用 xlsxwriter 来保存数据。
import os
import requests
import re
from openpyxl import load_workbook
import xlsxwriter
from multiprocessing.dummy import Pool as ThreadPool
def spider(url, timeout=30):
    """Download the first product image found on a search-result page.

    Fetches ``url``, extracts the first product-image address from the HTML
    and saves the image as ``<cwd>/images/<sku>.jpg``. When the page contains
    no product image, the SKU is appended to the module-level ``No_images``
    list instead.

    Relies on two module-level names set by the script entry point:
    ``cwd`` (base directory for output) and ``No_images`` (list collecting
    the SKUs for which no image was found).

    Args:
        url: Search URL whose query string contains ``q=<numeric sku>``.
        timeout: Seconds to wait for each HTTP request before giving up, so
            a stalled connection cannot block a worker thread forever.

    Returns:
        None. Results are reported through the saved file, ``No_images``
        and console output.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/80.0.3987.132 Safari/537.36'
        ),
    }

    # Extract the SKU that follows "q=" in the link.
    sku = re.findall(r'q=(\d+)', url, re.S)
    if not sku:
        # Without a numeric SKU there is no file name to save under and
        # nothing meaningful to record, so skip this link.
        print('无法从链接中解析sku:' + url)
        return

    try:
        # ``headers`` must be passed by keyword: the second positional
        # argument of requests.get is ``params`` (query string), not headers.
        html = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One failed request must not abort the whole pool.map run.
        print('请求失败:' + url + ' (' + str(exc) + ')')
        return

    pic_url = re.findall(
        'class="product-image">.*?<img src="(.*?)" height="', html.text, re.S)
    if not pic_url:
        print('没有找到' + sku[0] + '产品')
        No_images.append(sku[0])
        return

    print('正在下载' + sku[0] + '图片,图片地址:' + pic_url[0])
    try:
        pic = requests.get(pic_url[0], timeout=timeout)
    except requests.RequestException as exc:
        print('请求失败:' + pic_url[0] + ' (' + str(exc) + ')')
        return

    # Build the path portably and make sure the target directory exists;
    # exist_ok=True keeps this safe when several threads get here at once.
    image_dir = os.path.join(cwd, 'images')
    os.makedirs(image_dir, exist_ok=True)
    image_path = os.path.join(image_dir, sku[0] + '.jpg')
    with open(image_path, 'wb') as file:
        file.write(pic.content)
# NOTE: creating the new Excel file with openpyxl failed to write the data, so xlsxwriter is used to save it instead.
def save_excel(sku):
    """Write the SKUs that have no image to ``No_images.xlsx``.

    Creates ``No_images.xlsx`` inside the module-level ``cwd`` directory with
    a single worksheet: a ``No_images_sku`` header in the first cell of
    column A followed by one SKU per row.

    Args:
        sku: Sequence of SKU values to record; may be empty, in which case
            only the header is written.

    Returns:
        None.
    """
    print(sku)
    # os.path.join instead of a hard-coded '\\' so the file lands inside
    # ``cwd`` on every operating system.
    workbook_path = os.path.join(cwd, 'No_images.xlsx')
    # The context manager closes the workbook even if a write raises.
    with xlsxwriter.Workbook(workbook_path) as wb1:
        ws1 = wb1.add_worksheet()
        ws1.write(0, 0, 'No_images_sku')
        # Row 0 holds the header, so data rows start at 1.
        for row, value in enumerate(sku, start=1):
            ws1.write(row, 0, value)
    print('保存没有图片的sku成功!')
if __name__ == '__main__':
    # Script entry point: read SKUs from the source workbook, fetch each
    # product's image concurrently, then save the SKUs that had no image.
    # ``cwd`` and ``No_images`` are module-level names read by spider() and
    # save_excel().
    cwd = os.getcwd()
    path = os.path.join(cwd, '最近12个月没有销量产品(201711).xlsx')
    wb = load_workbook(path)
    ws = wb.worksheets[0]

    # Make sure the output directory for downloaded images exists before
    # any worker tries to write into it.
    os.makedirs(os.path.join(cwd, 'images'), exist_ok=True)

    urls = []
    No_images = []
    # Read the SKU from column 2 of every row and build its search URL.
    for i in range(1, ws.max_row + 1):
        sku = ws.cell(i, 2).value
        if sku is not None:
            print('正在爬取第' + str(i) + '个sku图片')
            url = 'http://www.fulchic.com/catalogsearch/result/?q=' + str(sku)
            urls.append(url)

    # multiprocessing.dummy provides a *thread* pool (not processes), which
    # suits this I/O-bound work; 50 is the number of worker threads.
    pool = ThreadPool(50)
    try:
        # Run spider() once for every URL in the list across the pool.
        pool.map(spider, urls)
    finally:
        # Always release the worker threads, even if a task raised.
        pool.close()
        pool.join()

    save_excel(No_images)
up0 2020-06-13
sschencn 2019-12-27
三石 2019-04-09
HMHYY 2019-03-17
一叶不知秋 2018-12-18
wangyaqi 2018-07-16
elizabethxxy 2018-06-11
linkequa 2018-06-11
fanhuasijin 2018-05-04
wwzhang00 2018-05-03
小海 2018-05-03
CandyGL 2018-04-24
liusarazhang 2018-04-24
木心 2018-03-25
jibkfv 2018-03-16
cassiePython 2019-03-11
pythoncream 2019-03-11
admans 2018-12-12