李毅吧是一个比较大的百度贴吧,里面的热门帖子内容质量都很不错。所以,我写了一个抓取其中热门帖子图片的脚本,现在分享给大家,希望大家一起学习进步:
import requests
from lxml import etree
from bs4 import BeautifulSoup
import threading
import time
import os
import re
import random
class tieba():
    """Download the images posted in popular threads of one Baidu Tieba forum.

    The crawl walks the forum's thread list, keeps threads that are not
    pinned and have more than 100 replies, and stores every image found in
    the thread's posts under ``<save_root>/<thread title>/<n>.jpg``.
    """

    # Host used to build the list URL and to resolve relative thread links.
    BASE_URL = 'https://tieba.baidu.com'
    # The ``pn`` query parameter is a post offset; list pages advance in
    # steps of 50 (pn=0, 50, 100, ...).
    PAGE_SIZE = 50
    # Seconds to wait on any single HTTP request. requests has no default
    # timeout, so without this a stalled connection blocks forever.
    TIMEOUT = 10

    def __init__(self, tiebaname, save_root='/tiebaimgs/'):
        """Prepare a crawler for the forum called *tiebaname*.

        Args:
            tiebaname: Forum name as it appears in the ``kw`` URL parameter.
            save_root: Directory under which one sub-directory per thread
                is created. Defaults to the original ``/tiebaimgs/``.
        """
        # Running index used to name the image files of the current thread.
        self.i = 0
        UserAgentlist = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101',
        ]
        # One User-Agent is picked per crawler instance and reused.
        self.headers = {'User-Agent': random.choice(UserAgentlist)}
        self.tiebaname = tiebaname
        self.save_root = save_root

    def res(self, url):
        """Fetch *url* and return its body parsed as a BeautifulSoup tree.

        Raises:
            requests.RequestException: On connection errors or timeout.
        """
        res = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        return BeautifulSoup(res.text, 'lxml')

    def _list_url(self, page):
        """Return the thread-list URL for zero-based list page *page*."""
        offset = page * self.PAGE_SIZE
        return (self.BASE_URL + "/f?kw=" + self.tiebaname
                + "&ie=utf-8&pn=" + str(offset))

    def shouye(self, pages=2):
        """Scan the first *pages* list pages and process every thread entry.

        A failure on one page or one thread is reported and skipped so that
        it does not abort the rest of the crawl.
        """
        for page in range(0, pages):
            url = self._list_url(page)
            try:
                soup = self.res(url)
            except requests.RequestException as exc:
                print('skip list page %s: %s' % (url, exc))
                continue
            for entry in soup.find_all('div', class_='t_con cleafix'):
                try:
                    self.shuaixuan(str(entry))
                except (requests.RequestException, OSError) as exc:
                    print('skip thread: %s' % exc)

    @staticmethod
    def _hot_thread_url(title, min_replies=100):
        """Return the absolute URL of a hot thread, or None to skip it.

        *title* is the HTML of one list entry. The entry qualifies when it
        carries no ``<i alt=...>`` marker (used here as the pinned-thread
        test) and its reply count is strictly greater than *min_replies*.
        Entries whose reply count or link cannot be parsed are skipped.
        """
        if re.findall('<i alt=(.*?)</i>', title):
            return None
        replies = re.findall('title="回复">(.*?)</span>', title)
        links = re.findall('href="(.*?)"', title)
        if not replies or not links:
            return None
        try:
            count = int(replies[0])
        except ValueError:
            # Non-numeric counter text; treat the entry as not qualifying.
            return None
        if count <= min_replies:
            return None
        href = links[0]
        if href.startswith(('http://', 'https://')):
            return href
        return tieba.BASE_URL + href

    def shuaixuan(self, title):
        """Filter one list entry and download its images if it is hot."""
        url = self._hot_thread_url(title)
        if url is None:
            return
        print(url)
        self.ziye(url)

    @staticmethod
    def _safe_dirname(name):
        """Turn a thread title into a string usable as a directory name."""
        cleaned = name.strip('?').strip()
        # Path separators and characters reserved on common filesystems
        # would either fail in makedirs or escape the intended directory.
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', cleaned)
        cleaned = cleaned.strip(' .')
        return cleaned or 'untitled'

    def ziye(self, url):
        """Open the thread at *url* and save all images found in its posts."""
        soup = self.res(url)
        heading = soup.find('h3')
        print(heading)
        if heading is None:
            heading = soup.find('h1')
        neirong = soup.find_all('div', class_='d_post_content j_d_post_content')

        found = re.findall('title="(.*?)"', str(heading))
        if found:
            raw_title = found[0]
        elif heading is not None:
            # Heading without a title attribute: fall back to its text.
            raw_title = heading.get_text()
        else:
            # No heading at all: name the directory after the URL tail.
            raw_title = url.rstrip('/').rsplit('/', 1)[-1]
        title = self._safe_dirname(raw_title)
        print(title)

        # The trailing '' makes join() end the path with a separator, so
        # img_load can append the file name directly.
        lujing = os.path.join(self.save_root, title, '')
        os.makedirs(lujing, exist_ok=True)

        # File numbering restarts for every thread.
        self.i = 0
        for wen in neirong:
            imgs = re.findall('src="(.*?)"', str(wen))
            if imgs:
                self.img_load(imgs, lujing)

    def img_load(self, imgs, lujing):
        """Download each URL in *imgs* into directory prefix *lujing*.

        URLs containing ``gsp0`` are skipped, as are URLs that are not
        HTTP(S) and downloads that fail. Files are named ``<self.i>.jpg``
        and ``self.i`` advances only when a file was actually written.
        """
        for imgurl in imgs:
            print(imgurl)
            print(self.i)
            if 'gsp0' in imgurl:
                continue
            if imgurl.startswith('//'):
                # Protocol-relative src; requests needs an explicit scheme.
                imgurl = 'https:' + imgurl
            if not imgurl.startswith(('http://', 'https://')):
                continue
            try:
                res = requests.get(url=imgurl, headers=self.headers,
                                   timeout=self.TIMEOUT)
                res.raise_for_status()
            except requests.RequestException as exc:
                print('skip image %s: %s' % (imgurl, exc))
                continue
            with open(lujing + str(self.i) + '.jpg', 'wb') as f:
                f.write(res.content)
            self.i += 1
            # Pause between downloads to keep the request rate low.
            time.sleep(1)
# Script entry point: crawl the hot threads of the "李毅" forum.
if __name__ == '__main__':
    # Build the crawler for the target forum, then start from its list pages.
    liyi = tieba('李毅')
    liyi.shouye()
下一篇: python如何写一个zblog采集程序
上一篇:python如何实现百度贴吧自动回复









评论