爬虫的简单操作
今天学习了python爬虫的简单操作。
1.学会创建文件夹和创建文件:
import os
def mkdir(path):
    """Create directory *path* (with parents) and print a status message."""
    if not os.path.exists(path):
        # Checking first avoids the FileExistsError that makedirs would raise.
        os.makedirs(path)
        print("Done .")
        return
    print("The path has already existed .")
def write(path, str):  # noqa: A002 - parameter name kept for backward compatibility
    """Write *str* to the file at *path*, replacing any existing content."""
    # "w" (not the original "w+") is enough for a pure write, and an explicit
    # encoding makes the output identical on every platform - it also matches
    # the utf-8 convention used by write_to_file elsewhere in this file.
    with open(path, "w", encoding="utf-8") as file:
        file.write(str)
def main():
    """Demo driver: create a "test" folder and drop a greeting file inside."""
    mkdir("test")
    write("test/test.txt", "hello world")


if __name__ == "__main__":
    main()

# 2. Get the source code of a website (if it can be accessed):
from bs4 import BeautifulSoup
import requests
def main():
    """Fetch baidu's home page and pretty-print its HTML source."""
    response = requests.get("https://www.baidu.com")  # fetch the page
    response.encoding = "utf-8"  # Chinese site: decode the body as utf-8
    # prettify() only re-indents the markup - it deletes no data.
    print(BeautifulSoup(response.text, "lxml").prettify())


if __name__ == "__main__":
    main()

# 3. Get the elements of specific tags from a site's source (if it can be accessed):
import requests
from bs4 import BeautifulSoup
def write_to_file(content):
    """Append *content* to save.txt, encoded as utf-8."""
    with open("save.txt", "a", encoding="utf-8") as out:
        out.write(content)
def get_blog_info(url):
    """Fetch *url* and print a sampler of parsed elements, separated by rules."""
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    divider = "=" * 100
    print(soup.title)            # the whole <title> tag
    print(divider)
    print(type(soup.title))      # a bs4 Tag object
    print(divider)
    print(type(soup.title.string))  # a bs4 NavigableString
    print(divider)
    print(soup.title.string)     # just the title text
    print(divider)
    print(soup.head)             # the whole <head> section
    print(divider)
    print(soup.p)                # the first <p> tag
def main():
    """Dump a handful of parsed elements from a sample blog page."""
    get_blog_info("https://www.cnblogs.com/sgh1023")


if __name__ == "__main__":
    main()

# 4. Download one image (if it can be accessed):
import requests
from bs4 import BeautifulSoup
import os
tot=0  # running counter used to number saved image files: 0.png, 1.png, ...
path="save"  # destination directory the downloaded images are written into
def mkdir(path):
    """Ensure directory *path* exists, creating it (with parents) if needed."""
    if not os.path.exists(path):
        os.makedirs(path)
def save(content):
    """Write *content* (image bytes) to <path>/<tot>.png and bump the counter."""
    global tot  # `path` is only read, so it does not need a global declaration
    mkdir(path)  # make sure the target directory exists before writing
    # "wb" replaces the original "wb+": read access is never used. The
    # with-statement already closes the file, so the original's explicit
    # file.close() inside the block was a redundant second close.
    with open(os.path.join(path, str(tot) + ".png"), "wb") as file:
        file.write(content)
    tot = tot + 1
def download_image(url):
    """Fetch *url* and store the response body as the next numbered image.

    Best effort: a bad URL or network failure raises out of requests.get.
    """
    print("Now downloading...", tot)
    save(requests.get(url).content)
    print("Done !")
def main():
    """Download one sample image from baidu."""
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")


if __name__ == "__main__":
    main()

# 5. Download all the images of one web page:
import os
import urllib
import urllib.request  # bare `import urllib` does not load the request submodule

import requests
from bs4 import BeautifulSoup
tot=0  # running counter used to number saved image files: 0.png, 1.png, ...
path="save"  # destination directory the downloaded images are written into
def mkdir(path):
    """Create *path* (with parents) unless it already exists."""
    if os.path.exists(path):
        return
    os.makedirs(path)
def save(content):
    """Write *content* (image bytes) to <path>/<tot>.png and bump the counter."""
    global tot  # `path` is only read, so it does not need a global declaration
    mkdir(path)  # make sure the target directory exists before writing
    # "wb" replaces the original "wb+": read access is never used. The
    # with-statement already closes the file, so the original's explicit
    # file.close() inside the block was a redundant second close.
    with open(os.path.join(path, str(tot) + ".png"), "wb") as file:
        file.write(content)
    tot = tot + 1
######################################################################
def get_html_content(url):
    """Fetch *url* and return the raw response body as bytes.

    Sends a Chrome-like user-agent header so simple bot filters let the
    request through. (The original used typographic quotes ``'...'`` as
    string delimiters - a SyntaxError - fixed here to ASCII quotes; the
    header text itself is unchanged.)
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    return html
def url_exist(url):
    """Return True when a GET request to *url* succeeds, False on a request error.

    Any HTTP response (even 404) counts as "exists" - only transport-level
    failures return False, matching the original behavior. The original bare
    ``except:`` also swallowed KeyboardInterrupt/SystemExit; catching
    requests' own exception hierarchy keeps the best-effort intent without
    hiding unrelated failures.
    """
    try:
        requests.get(url)
        return True
    except requests.RequestException:
        return False
def download_image(url):
    """Download *url* into the save directory if the URL is reachable.

    NOTE(review): the URL is fetched twice - once by url_exist and once
    here - so each image costs two requests.
    """
    print("Now downloading...", tot, url)
    if not url_exist(url):
        print("Unavailable !")
        return
    save(requests.get(url).content)
    print("Done !")
######################################################################
def process(str):###简单地处理网址
if(str[0]==‘h‘):
return str;
elif(str[0]==‘/‘ and str[1]!=‘/‘):
return "https:/"+str
return "https:"+str;
def get_image(url):
    """Find every <img> tag with a src attribute at *url* and download each one."""
    soup = BeautifulSoup(get_html_content(url), "lxml")
    for tag in soup.find_all("img", {"src": True}):
        download_image(process(tag["src"]))
def main():
    """Grab every image on bilibili's front page."""
    get_image("https://www.bilibili.com")


if __name__ == "__main__":
    main()

# Of course, the arguments passed to find_all depend on the concrete situation.