Simple Crawler Operations
Today I learned some basic Python web-crawler operations.
1. Creating a directory and a file:
import os

def mkdir(path):
    if os.path.exists(path):  ### check whether the directory exists first, or makedirs may raise an error
        print("The path already exists.")
    else:
        os.makedirs(path)  ### create the directory
        print("Done.")

def write(path, text):
    with open(path, "w+") as file:  ### write the file
        file.write(text)

def main():
    mkdir("test")
    write("test/test.txt", "hello world")

if __name__ == "__main__":
    main()
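As a side note, os.makedirs also accepts exist_ok=True (Python 3.2+), which makes the explicit existence check unnecessary. A minimal sketch of the same idea:

import os

def write(path, text):
    os.makedirs(os.path.dirname(path), exist_ok=True)  ### no-op if the directory already exists
    with open(path, "w", encoding="utf-8") as file:
        file.write(text)

write("test/test.txt", "hello world")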
2. Getting a site's source code (if it can be accessed):
from bs4 import BeautifulSoup
import requests

def main():
    html = requests.get("https://www.baidu.com")  ### fetch this URL
    html.encoding = "utf-8"  ### a Chinese page, so set the character set explicitly
    soup = BeautifulSoup(html.text, "lxml")  ### BeautifulSoup is an HTML parser, a friendlier alternative to regular expressions
    print(soup.prettify())  ### pretty-print the source (reformats, does not delete data)

if __name__ == "__main__":
    main()
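Hard-coding utf-8 happens to work for Baidu, but other sites use other charsets. A sketch of a more general variant: requests can guess the encoding from the page body via response.apparent_encoding, and the stdlib html.parser works even when lxml is not installed.

import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.baidu.com")
response.encoding = response.apparent_encoding  ### sniff the charset from the body instead of hard-coding utf-8
soup = BeautifulSoup(response.text, "html.parser")  ### stdlib parser; no lxml dependency
print(soup.prettify())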
3. Getting the elements of particular tags from a site's source code (if it can be accessed):
import requests
from bs4 import BeautifulSoup

def write_to_file(content):
    with open("save.txt", "a", encoding="utf-8") as f:
        f.write(content)

def get_blog_info(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    print(soup.title)  ### all kinds of elements
    print("=" * 100)
    print(type(soup.title))
    print("=" * 100)
    print(type(soup.title.string))
    print("=" * 100)
    print(soup.title.string)
    print("=" * 100)
    print(soup.head)
    print("=" * 100)
    print(soup.p)

def main():
    blog_url = "https://www.cnblogs.com/sgh1023"
    get_blog_info(blog_url)

if __name__ == "__main__":
    main()
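Note that write_to_file is defined above but never called. One way to wire it in (just a sketch; saving the page title is only an example target) would be:

import requests
from bs4 import BeautifulSoup

def write_to_file(content):
    with open("save.txt", "a", encoding="utf-8") as f:
        f.write(content)

html = requests.get("https://www.cnblogs.com/sgh1023")
soup = BeautifulSoup(html.text, "lxml")
if soup.title and soup.title.string:  ### guard: some pages have no usable <title>
    write_to_file(soup.title.string + "\n")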
4. Downloading a single image (if it can be accessed):
import os
import requests

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb+") as file:  ### the with block closes the file automatically
        file.write(content)
    tot = tot + 1

def download_image(url):  ### download the image; success is not guaranteed
    print("Now downloading...", tot)
    response = requests.get(url)
    save(response.content)
    print("Done!")

def main():
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")

if __name__ == "__main__":
    main()
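Since success is not guaranteed, a slightly more defensive sketch checks the HTTP status and the Content-Type before writing the file (download_image_checked is a hypothetical name, not part of the script above):

import requests

def download_image_checked(url, filename):  ### hypothetical helper for illustration
    response = requests.get(url, timeout=10)
    response.raise_for_status()  ### raises an exception on 4xx/5xx responses
    if not response.headers.get("Content-Type", "").startswith("image/"):
        print("Not an image:", url)
        return
    with open(filename, "wb") as file:
        file.write(response.content)
    print("Done!")

download_image_checked("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png", "baidu_logo.png")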
5. Downloading the images on a web page:
import os
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb+") as file:
        file.write(content)
    tot = tot + 1

######################################################################

def get_html_content(url):  ### fetch the page source
    req = urllib.request.Request(url)  ### add a header to masquerade as a Google Chrome browser (copied code)
    req.add_header("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

def url_exist(url):  ### check whether the URL is reachable
    try:
        requests.get(url)
        return True
    except requests.exceptions.RequestException:
        return False

def download_image(url):  ### download one image
    print("Now downloading...", tot, url)
    if url_exist(url):  ### check whether the URL is reachable
        response = requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")

######################################################################

def process(page_url, src):  ### resolve absolute, protocol-relative and root-relative srcs against the page URL
    return urllib.parse.urljoin(page_url, src)

def get_image(url):
    soup = BeautifulSoup(get_html_content(url), "lxml")
    items = soup.find_all("img", {"src": True})
    for i in items:
        download_image(process(url, i["src"]))

def main():
    url = "https://www.bilibili.com"
    get_image(url)

if __name__ == "__main__":
    main()
Of course, the arguments to find_all depend on the specific page.
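For instance, a few common find_all patterns (the HTML snippet here is made up purely for illustration):

from bs4 import BeautifulSoup

html = """
<div class="post"><a href="/a">first</a></div>
<div class="ad"><a href="https://example.com">second</a></div>
<img src="x.png">
"""
soup = BeautifulSoup(html, "lxml")

print(soup.find_all("a"))                   ### every <a> tag
print(soup.find_all("div", class_="post"))  ### filter by CSS class (note the trailing underscore)
print(soup.find_all("img", {"src": True}))  ### tags that simply have a src attribute
print(soup.find_all("a", href=lambda h: h and h.startswith("http")))  ### attribute filter via a callable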