BeautifulSoup搜索html文档

学习资料 1K+ 3年前 0 云创动力

一、BeautifulSoup搜索html文档
1、使用class_参数按html标签的class属性搜索文档，返回匹配节点的列表
xxx=bs.find_all(class_='属性值')
示例如下：获取天涯社区的天涯杂谈板块的内容信息
from bs4 import BeautifulSoup
import urllib.request

# 请求页面，获取HTTP响应对象

# Script: list the threads on the Tianya "free" board and print each thread's
# URL, title, and the text of its first post.

# Fetch the board's listing page; the response is closed once it is parsed.
with urllib.request.urlopen("http://bbs.tianya.cn/list-free-1.shtml") as html:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bs = BeautifulSoup(html, "html.parser")

# Collect the nodes whose class attribute is the string "td-title faceblue".
tags = bs.find_all(class_="td-title faceblue")

for mytag in tags:
    # A matched node without a link (or without an href) cannot be followed.
    link = mytag.a
    if link is None or not link.get('href'):
        continue

    # Build the thread URL from the site root and the link's href.
    myurl = "http://bbs.tianya.cn" + link['href']
    print("url:", myurl)

    # Thread title text.
    print("title:", link.text.strip())

    # Fetch the thread page and parse it.
    with urllib.request.urlopen(myurl) as html1:
        print("内容为:")
        bs1 = BeautifulSoup(html1, "html.parser")
    mydivs1 = bs1.find_all(class_="bbs-content clearfix")

    # Print only the first matched content node (the original poster's
    # message); pages with no such node are skipped instead of crashing.
    if mydivs1:
        print(mydivs1[0].text.strip())
    print("=======================")

使用BeautifulSoup下载壁纸图片项目实践
from bs4 import BeautifulSoup
import os
import urllib.request

# Script: download every image found inside <div class="list"> on the
# netbian.com index page, saving each one as "<alt text>.jpg" in the
# current working directory.
from urllib.parse import urljoin

page_url = "http://www.netbian.com/index.htm"
with urllib.request.urlopen(page_url) as html:
    bs = BeautifulSoup(html, "html.parser")

listdiv = bs.find_all("div", class_="list")
for imgdiv in listdiv:
    listimg = imgdiv.find_all("img")
    for myimg in listimg:
        # The alt text becomes the file name and src is the download
        # address; an <img> missing either one cannot be saved.
        title = myimg.get('alt')
        imgurl = myimg.get('src')
        if title is None or imgurl is None:
            continue

        # "/" is a path separator and would break the output file name.
        title = title.replace("/", "_")
        print("标题:", title)

        # Resolve relative or protocol-relative src values against the page
        # URL; absolute URLs pass through unchanged.
        imgurl = urljoin(page_url, imgurl)
        print("img url:", imgurl)

        # Read the raw image bytes.
        with urllib.request.urlopen(imgurl) as pic:
            piccontent = pic.read()

        # Write the bytes in binary mode; the file is closed even on error.
        with open(title + ".jpg", "wb") as f:
            f.write(piccontent)