最近一直在思考如何能做一个想百度一样的爬虫,能够自行的爬取网站里所有页面,并且排除掉非该网站下的链接。于是,就写了以下的这么个程序。由于本人对于线程还不是了解,刚开始写的初稿并没有加上线程,导致遇到数据多一些的网站,脚本运行时间就很长。这个其实是非常烦恼的,因为调试起来,脚本运行时间太长,会很耽误时间,所以找了个学习Python时在网上认识的一位朋友帮忙加了下线程,才有了以下这段代码:
import requests
from bs4 import BeautifulSoup
# 进程
from threading import Thread
import time
bbb=[]
jishu=0
def shouye():
global jishu
# url='http://www.ehstrain.cn/'
url=input("输入主域名:")
# url='http://www.ehstrain.cn'
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
r=requests.get(url,headers=headers)
soup=BeautifulSoup(r.content.decode('utf-8'),'lxml')
suoyoua=soup.find_all('a')
alla=[]
for lia in suoyoua:
dana=lia['href']
if dana.find('http')==-1 or dana.find(url) != -1:
alla.append(dana)
# 去重
alla=sorted(set(alla), key=alla.index)
# 开启多线程
t_list = []
for lianjie in alla:
for i in range(5):
t = Thread(target=neiye, args=(lianjie, url))
t_list.append(t)
t.start()
# 回收线程
for t in t_list:
t.join()
def neiye(lianjie,url):
global bbb
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
if lianjie.find(url)!=-1:
ciurl= lianjie
elif lianjie.find('http')==-1 and lianjie.find('/')!=-1:
ciurl=url + lianjie
else:
ciurl = url + '/' + lianjie
r = requests.get(ciurl , headers=headers)
# print(ciurl)
alla = []
try:
soup = BeautifulSoup(r.content.decode('utf-8'), 'lxml')
suoyoua = soup.find_all('a')
except:
print(ciurl)
else:
for lia in suoyoua:
try:
dana = lia['href']
except:
continue
if dana.find('http') == -1 or dana.find(url) != -1:
alla.append(dana)
# 去重
alla = sorted(set(alla), key=alla.index)
global jishu
for lian2 in alla:
if lian2 in bbb:
continue
else:
bbb.append(lian2)
neiye(lian2,url)
if __name__ == '__main__':
startime = time.time()
shouye()
bbb = sorted(set(bbb), key=bbb.index)
num=0
for ads in bbb:
if ads.find('http')!=-1:
print(num,ads)
num += 1
endtime = time.time()
thetime=endtime-startime
print(thetime)代码其实还有些问题,喜欢学习的朋友可以自己去调整,这里我就不放上最后的版本了。
下一篇: 移动网站如何优化图片?
上一篇:网站该如何优化,优化最终达到的目的是什么











评论