Paginated Crawling of a Wallpaper Site's Images
1. Paginated crawling of a wallpaper site's images
Example:
from bs4 import BeautifulSoup
import time
import urllib.request

# Download the images on the wallpaper page at the given URL
def writeImageUrl(urlstr):
    # html = urllib.request.urlopen("http://www.netbian.com/weimei/index.htm")
    html = urllib.request.urlopen(urlstr)
    bs = BeautifulSoup(html, 'html.parser')
    divs = bs.find_all("div", class_="list")
    for u in divs:
        listimg = u.find_all("img")
        for im in listimg:
            print(im['src'])  # the src attribute of the img tag
            print(im['alt'])
            print("------------")
            imgbyte = urllib.request.urlopen(im['src'])  # fetch the image from its src URL
            fname = im['alt']  # the alt text becomes the file name
            fname = fname.replace("/", " ")  # / is illegal in file names, replace it with a space
            f = open("d:\\jpg\\" + fname + ".jpg", "wb")  # save under d:\jpg; "wb" writes binary data
            f.write(imgbyte.read())
            f.close()

# Build the URL of each listing page and crawl them one by one
def geturl():
    for j in range(1, 6):
        if j == 1:
            urlstr = "http://www.netbian.com/weimei/index.htm"
        else:
            urlstr = "http://www.netbian.com/weimei/index_" + str(j) + ".htm"
        print(urlstr)
        writeImageUrl(urlstr)
        time.sleep(2)  # wait 2 seconds between pages

if __name__ == '__main__':
    geturl()
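The listing above assumes every img src is an absolute URL and that d:\jpg already exists. Below is a minimal, more defensive sketch of the same download step; the download_page_images name, the SAVE_DIR constant, the browser-style User-Agent header, and the filename filter are my additions, not part of the original code:

import os
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

SAVE_DIR = "d:\\jpg"  # hypothetical target folder; change it to suit your machine

def download_page_images(page_url):
    # Some sites reject urllib's default User-Agent, so send a browser-style one.
    req = urllib.request.Request(page_url, headers={"User-Agent": "Mozilla/5.0"})
    bs = BeautifulSoup(urllib.request.urlopen(req), "html.parser")
    os.makedirs(SAVE_DIR, exist_ok=True)  # create the folder if it does not exist yet
    for div in bs.find_all("div", class_="list"):
        for img in div.find_all("img"):
            # urljoin resolves a relative src against the page URL; absolute srcs pass through
            src = urllib.parse.urljoin(page_url, img["src"])
            # drop every character that is illegal in Windows file names
            fname = "".join(c for c in img.get("alt", "unnamed") if c not in '\\/:*?"<>|')
            with open(os.path.join(SAVE_DIR, fname + ".jpg"), "wb") as f:
                f.write(urllib.request.urlopen(src).read())

The with statement closes the file even if the download raises an exception, which the explicit open/close pair in the original does not guarantee.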
2. Scraping practice on the Tianya Zatan (天涯杂谈) board
import bs4
from bs4 import BeautifulSoup
import urllib.request

# Fetch one listing page and print every post title and link on it
def gettianyanews(urlstr):
    html = urllib.request.urlopen(urlstr)
    bs = BeautifulSoup(html, "html.parser")
    mytds = bs.find_all("td", class_="td-title faceblue")
    for mytd in mytds:
        print("title:", mytd.a.text.strip())
        print("url:", mytd.a['href'])
        print("=====================================")
# Find the next page's URL and crawl the following pages recursively
def getpageurl(urlstr, startpagenum):
    startpagenum = startpagenum + 1
    urlfull = "http://bbs.tianya.cn"
    html = urllib.request.urlopen(urlstr)
    bs = BeautifulSoup(html, "html.parser")
    linkdivs = bs.find_all("div", class_="links")
    for u in linkdivs:
        for lista in u:
            if isinstance(lista, bs4.element.Tag):  # u's children include NavigableStrings; keep only real tags
                if lista.text.strip() == '下一页':  # the "next page" link
                    print(lista.text.strip())
                    print(lista['href'])
                    urlfull = urlfull + lista['href']  # href is relative, so prepend the site root
                    print(urlfull)
                    print("startpagenum=", startpagenum)
                    if startpagenum > 10:  # stop after ten pages
                        return
                    getpageurl(urlfull, startpagenum)
                    gettianyanews(urlfull)  # print this page's titles once the recursion returns
                    print("===============")

if __name__ == '__main__':
    getpageurl("http://bbs.tianya.cn/list-free-1.shtml", 1)
The output looks like this:
下一页
/list.jsp?item=free&nextid=1604887076000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604887076000
startpagenum= 2
下一页
/list.jsp?item=free&nextid=1604881547000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604881547000
startpagenum= 3
下一页
/list.jsp?item=free&nextid=1604851796000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604851796000
startpagenum= 4
下一页
/list.jsp?item=free&nextid=1604833340000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604833340000
startpagenum= 5
下一页
/list.jsp?item=free&nextid=1604811503000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604811503000
startpagenum= 6
下一页
/list.jsp?item=free&nextid=1604758611000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604758611000
startpagenum= 7
下一页
/list.jsp?item=free&nextid=1604736453000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604736453000
startpagenum= 8
下一页
/list.jsp?item=free&nextid=1604714628000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604714628000
startpagenum= 9
下一页
/list.jsp?item=free&nextid=1604667345000
http://bbs.tianya.cn/list.jsp?item=free&nextid=1604667345000
startpagenum= 10
The titles printed:
title: 前世今生之:我与小白龟的狭路夫妻缘
url: /post-free-6163292-1-1.shtml
title: 从“除四害”到“禁摩限电”-又见典型中国式决策
url: /post-free-5435197-1.shtml
title: 世上本没有不平事,是善变的心失去了平衡。
url: /post-free-6163290-1.shtml
title: 柏林墙,泪与笑凝成的自由故事(上)
url: /post-free-5700336-1.shtml
title: 维权永远在路上
url: /post-free-6163287-1.shtml
title: 你现在所做的事情是你喜欢的吗
url: /post-free-6163286-1.shtml
title: 美女与乱伦
url: /post-free-335062-1.shtml
.....
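Two quirks of the recursive getpageurl are worth noting: it never calls gettianyanews on the very first page, and because gettianyanews runs only after the recursive call returns, the titles print in reverse page order. Here is a sketch of the same next-page following written as a loop instead; crawl_pages and max_pages are names I invented, and it reuses gettianyanews from the listing above:

import time
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

def crawl_pages(start_url, max_pages=10):
    url = start_url
    for _ in range(max_pages):
        gettianyanews(url)  # print the titles on the current page first
        bs = BeautifulSoup(urllib.request.urlopen(url), "html.parser")
        next_url = None
        for div in bs.find_all("div", class_="links"):
            for a in div.find_all("a"):  # find_all returns only Tags, so no type check is needed
                if a.text.strip() == '下一页':  # the "next page" link
                    next_url = urllib.parse.urljoin(url, a['href'])
        if next_url is None:  # no next-page link on the last page: stop
            break
        url = next_url
        time.sleep(2)  # pause between requests to be polite to the server

Calling crawl_pages("http://bbs.tianya.cn/list-free-1.shtml") would then walk the first ten listing pages in their natural order.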