Simple Web-Scraping Operations

Today I learned some simple operations for Python web scraping.

1. Creating a folder and writing to a file:

import os

def mkdir(path):
    if os.path.exists(path):###check whether the folder already exists, otherwise makedirs may raise an error
        print("The path already exists.")
    else:
        os.makedirs(path)###create the folder (including any parent folders)
        print("Done.")

def write(path,content):
    with open(path,"w+") as file:###write the string to the file
        file.write(content)

def main():
    mkdir("test")
    write("test/test.txt","hello world")

if __name__=="__main__":
    main()
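
Incidentally, the existence check in mkdir can be left to the standard library itself; a minimal alternative sketch using the exist_ok flag of os.makedirs (available since Python 3.2):

import os

def mkdir(path):
    os.makedirs(path,exist_ok=True)###does nothing if the folder already exists, instead of raising an error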

2. Getting the source of a website (if it can be accessed):

from bs4 import BeautifulSoup
import requests

def main():
    html=requests.get("https://www.baidu.com")###request the page
    html.encoding="utf-8"###the page is in Chinese, so set the character set explicitly
    soup=BeautifulSoup(html.text,"lxml")###BeautifulSoup parses the HTML, so no hand-written regular expressions are needed
    print(soup.prettify())###pretty-print the source (formatting only, nothing is removed)

if __name__=="__main__":
    main()
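
It can also help to confirm that the request actually succeeded before handing the text to BeautifulSoup; the sketch below adds a status check and lets requests guess the character set instead of hard-coding utf-8 (raise_for_status and apparent_encoding are standard parts of requests):

from bs4 import BeautifulSoup
import requests

def fetch(url):
    html=requests.get(url,timeout=10)
    html.raise_for_status()###raises an exception for 4xx/5xx responses
    html.encoding=html.apparent_encoding###let requests guess the charset from the response body
    return BeautifulSoup(html.text,"lxml")

if __name__=="__main__":
    print(fetch("https://www.baidu.com").title.string)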

3. Getting the elements of particular tags from a website's source (if it can be accessed):

import requests
from bs4 import BeautifulSoup

def write_to_file(content):
    with open("save.txt","a",encoding="utf-8") as f:
        f.write(content)

def get_blog_info(url):
    html=requests.get(url)
    soup=BeautifulSoup(html.text,"lxml")
    print(soup.title)###all kinds of elements: the <title> tag itself
    print("="*100)
    print(type(soup.title))###a bs4 Tag object
    print("="*100)
    print(type(soup.title.string))###a NavigableString holding the text
    print("="*100)
    print(soup.title.string)###just the title text
    print("="*100)
    print(soup.head)###the whole <head> element
    print("="*100)
    print(soup.p)###only the first <p> tag

def main():
    blog_url="https://www.cnblogs.com/sgh1023"
    get_blog_info(blog_url)

if __name__=="__main__":
    main()
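
Note that soup.title and soup.p only return the first matching tag; to walk over every match, find_all can be used instead. A small sketch that lists all the links on the same blog page (the URL is the one from the example above):

import requests
from bs4 import BeautifulSoup

def list_links(url):
    soup=BeautifulSoup(requests.get(url).text,"lxml")
    for a in soup.find_all("a",{"href":True}):###every <a> tag that carries an href
        print(a.get_text(strip=True),a["href"])

if __name__=="__main__":
    list_links("https://www.cnblogs.com/sgh1023")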

4. Downloading an image (if it can be accessed):

import requests
import os

tot=0###counter used to name the saved files
path="save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot,path
    mkdir(path)
    with open(path+"/"+str(tot)+".png","wb+") as file:
        file.write(content)
        tot=tot+1

def download_image(url):###download the image; success is not guaranteed
    print("Now downloading...",tot)
    response=requests.get(url)
    save(response.content)
    print("Done !")

def main():
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")

if __name__=="__main__":
    main()
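
For bigger files it is usually better not to hold the whole body in memory at once; a rough sketch of the same download using requests' streaming mode (stream=True together with iter_content, both standard requests features):

import requests

def download_image(url,filename):
    response=requests.get(url,stream=True,timeout=10)
    response.raise_for_status()
    with open(filename,"wb") as file:
        for chunk in response.iter_content(chunk_size=8192):###write the body piece by piece
            file.write(chunk)

if __name__=="__main__":
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png","baidu.png")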

5. Downloading the images on a web page:

import requests
import urllib.request###needed for urllib.request.Request and urlopen
import os
from bs4 import BeautifulSoup

tot=0###counter used to name the saved files
path="save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot,path
    mkdir(path)
    with open(path+"/"+str(tot)+".png","wb+") as file:
        file.write(content)
        tot=tot+1
######################################################################
def get_html_content(url):###fetch the page source
    req=urllib.request.Request(url)###add a user-agent header to pretend to be Google Chrome (this snippet was copied)
    req.add_header('user-agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response=urllib.request.urlopen(req)
    html=response.read()
    return html

def url_exist(url):###check whether the URL can be requested
    try:
        response=requests.get(url)
        return True
    except requests.exceptions.RequestException:###a bare except would also hide unrelated errors
        return False

def download_image(url):###download one image
    print("Now downloading...",tot,url)
    if url_exist(url):###skip URLs that cannot be requested
        response=requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")
######################################################################
def process(src):###naively turn an img src into a full URL
    if src[0]=='h':###already an absolute http(s) URL
        return src
    elif src[0]=='/' and src[1]!='/':###root-relative path; strictly this should be joined with the page's own origin
        return "https:/"+src
    return "https:"+src###protocol-relative URL such as //i0.hdslb.com/...

def get_image(url):
    soup=BeautifulSoup(get_html_content(url),"lxml")
    items=soup.find_all("img",{"src":True})###every <img> tag that has a src attribute
    for i in items:
        download_image(process(i["src"]))

def main():
    url="https://www.bilibili.com"
    get_image(url)

if __name__=="__main__":
    main()
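
The process helper above only patches up absolute and protocol-relative URLs; urllib.parse.urljoin from the standard library resolves any kind of relative src against the page's own address, so a more robust sketch could look like this (the example srcs are made up for illustration):

from urllib.parse import urljoin

def process(src,base="https://www.bilibili.com"):
    return urljoin(base,src)###handles //host/x, /x and plain relative paths alike

print(process("//i0.hdslb.com/logo.png"))###-> https://i0.hdslb.com/logo.png
print(process("/img/icon.png"))###-> https://www.bilibili.com/img/icon.png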

Of course, the arguments passed to find_all depend on the specific page.
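
For instance, tags can also be filtered by attribute values or by CSS class; a few variants are sketched below (the class name video-card is only a made-up example, not something the page is known to use):

import requests
from bs4 import BeautifulSoup

soup=BeautifulSoup(requests.get("https://www.bilibili.com").text,"lxml")
print(soup.find_all("img",{"src":True}))###the same filter used above
print(soup.find_all("a",href=True,limit=10))###at most ten <a> tags that have an href
print(soup.find_all("div",class_="video-card"))###filter by CSS class; "video-card" is a hypothetical class name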