Security_Code/信息收集/网站相关/批量获取网页存活与标题.py

75 lines
3.7 KiB
Python

# coding:utf-8
import requests
import random
import re
from concurrent.futures import ThreadPoolExecutor
def headers():
REFERERS = [
"https://www.baidu.com",
"http://www.baidu.com",
"https://www.google.com.hk",
"http://www.so.com",
"http://www.sogou.com",
"http://www.soso.com",
"http://www.bing.com",
]
headerss = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
headers = {
'User-Agent': random.choice(headerss),
'Accept': 'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Cache-Control': 'max-age=0',
'referer': random.choice(REFERERS),
'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
}
return headers
def run(url):
print('Check url:'+url)
try:
r = requests.get(url,headers=headers(),allow_redirects=False,verify=False,timeout=10)
print(url + ':' + str(r.status_code))
encoding = requests.utils.get_encodings_from_content(r.text)[0]
res = r.content.decode(encoding,'replace')
title_pattern = '<title>(.*?)</title>'
title = re.search(title_pattern,res,re.S|re.I)
with open('result.txt','a+',encoding='utf-8')as a:
a.write(url + ':' + str(title.group(1)).strip() + '\n')
return True
except Exception as e:
print(e)
if '__main__' == __name__:
file_name = input('导入文本:')
tasks = list(set(x.strip() for x in open(file_name).readlines()))
print('总任务量:' + str(len(tasks)))
with ThreadPoolExecutor() as p: #类似打开文件,可省去.shutdown()
future_tasks = [p.submit(run, i) for i in tasks]
print('=' * 30)
print([obj.result() for obj in future_tasks])