# Security_Code/COMMON_CODE半成品-不再更新/获取网页中的可以注入的链接5.py


# -*- coding:utf-8 -*-
#__author__:langzi
#__blog__:www.langzi.fun
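"""
Crawl a target site, collect candidate injectable links ("dynamic" URLs that
carry parameters, plus static .htm/.html pages), normalise the dynamic URLs so
near-duplicates are only probed once, check which links are still reachable,
and record unreachable URLs in the Sec_Fail_Links MySQL table. The directory
name marks this as an unfinished script that is no longer updated; adjust the
MySQL settings below for your environment.
"""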
import contextlib
import pymysql
import re
import time
import requests
requests.packages.urllib3.disable_warnings()
from bs4 import BeautifulSoup
from urllib.parse import urlparse,urljoin
import random
from concurrent.futures import ThreadPoolExecutor
user = 'root'
passwd = 'root'
host = '127.0.0.1'
Dbname = 'langzi_scan_1'
port = 3306  # pymysql expects the port as an int, not a string
thread_s = 16
scan_level_s = 1
@contextlib.contextmanager
def connect_mysql():
    # Yields a cursor; the connection is committed and closed on exit.
    coon = pymysql.connect(user=user, passwd=passwd, host=host, db=Dbname, port=port, charset='utf8')
    cursor = coon.cursor()
    try:
        yield cursor
    except Exception as e:
        # MySQL error 1062 means the URL is already recorded (duplicate key); ignore it.
        if '1062, "Duplicate entry ' in str(e):
            print('Duplicate URL, skipping')
    finally:
        coon.commit()
        cursor.close()
        coon.close()
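# Usage pattern, as used throughout this script:
#   with connect_mysql() as cursor:
#       cursor.execute('insert into Sec_Fail_Links(url) values (%s)', (failed_url,))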
import urllib.parse, os.path
class filter_url:
    # Normalises URLs into templates so that pages differing only in
    # parameter values (or pseudo-static path segments) can be de-duplicated.
    def __init__(self):
        self.list_url_static = []

    def filter_url(self, url):
        url = urllib.parse.urlparse(url)
        if url.query != '':
            return self.params_filter(url)
        return self.static_filter(url)

    def static_filter(self, url):
        # Pseudo-static and plain URL path handling
        urls = os.path.splitext(url.path)
        if urls[1] != '':
            list_url = []
            for i in urls[0].split('/'):
                if i != '':
                    list_url.append('{%s:%s}' % (self.judgetype(i), len(i)))
            url_path = "/".join(list_url)
            return url.scheme + '://' + url.netloc + '/' + url_path + urls[1]
        else:
            list_url = []
            for i in url.path.split('/'):
                if i != '':
                    list_url.append('{%s:%s}' % (self.judgetype(i), len(i)))
            url_path = "/".join(list_url)
            return url.scheme + '://' + url.netloc + '/' + url_path

    def params_filter(self, url):
        # URL query-string handling: replace each parameter value with a
        # {type:length} placeholder.
        try:
            liststr = []
            for i in url.query.split('&'):
                para = i.split('=')
                length_int = len(para[1])
                if self.judgetype(para[1]) == 'int':
                    para[1] = '{int:%s}' % length_int
                else:
                    para[1] = '{str:%s}' % length_int
                para = '='.join(para)
                liststr.append(para)
            url_paras = '&'.join(liststr)
            return url.scheme + '://' + url.netloc + url.path + '?' + url_paras
        except:
            # Query string without key=value pairs: template the whole query.
            length_int = len(url.query)
            url_paras = '{' + self.judgetype(url.query) + ':%s}' % length_int
            return url.scheme + '://' + url.netloc + url.path + '?' + url_paras

    def callback_content(self, content):
        return re.split(r'-|_|\.', content)

    def judgetype(self, strs):
        # Returns 'int' if the string is numeric, otherwise 'str'.
        try:
            int(strs)
            return 'int'
        except:
            return 'str'
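# Illustrative example of the normalisation above (hypothetical URL):
#   filter_url().filter_url('http://example.com/news.php?id=123&cat=abc')
#   -> 'http://example.com/news.php?id={int:3}&cat={str:3}'
# Get_Result() later rewrites these {type:length} placeholders into '.*?' so
# dynamic URLs that differ only in parameter values are probed once.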
headerss = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
class Get_Links:
    def __init__(self, url):
        self.url = url
        self.headers = {'User-Agent': random.choice(headerss)}
        self.timeout = 15
        # self.domain = urlparse(self.url).netloc.replace('www.','').replace('/','').replace('.com.cn','').replace('.org.cn','').replace('.net.cn','').replace('.com','').replace('.cn','').replace('.cc','').replace('.net','').replace('.org','').replace('.info','').replace('.fun','').replace('.one','').replace('.xyz','').replace('.name','').replace('.io','').replace('.top','').replace('.me','').replace('.club','').replace('.tv','')
        # if self.domain.find('.')>0:
        #     self.domain = self.domain.split('.')[1]
        self.domain0 = urlparse(self.url).netloc.replace('/', '')
        # Domain used for "same site" checks, without the www. prefix.
        self.domain = self.domain0.replace('www.', '')
        self.result = {}
        self.all_links = []
        # All links collected on the first pass
        self.id_links = []
        # Dynamic links carrying parameters
        self.html_links = []
        # Static pages
        self.sche = 'http://'
        if 'http://' in self.url:
            self.sche = 'http://'
        else:
            self.sche = 'https://'
        self.mids = set()

    def Write_Logs(self, content):
        # Append a timestamped entry (string) to the log file.
        with open('log.txt', 'a+') as aa:
            aa.write('*********************************************' + '\n')
            aa.write(str(time.strftime('%Y-%m-%d:%H:%M:%S ', time.localtime())) + str(content) + '\n')
    def Request(self, url):
        # Fetch a URL and return the raw (undecoded) response body.
        # On failure, log the error and record the URL in Sec_Fail_Links.
        try:
            time.sleep(random.randint(1, 5))
            r = requests.get(url=url, headers=self.headers, timeout=self.timeout, verify=False)
            # print('Initial liveness check URL: {} status: {}'.format(url, r.status_code))
            # encoding = requests.utils.get_encodings_from_content(r.text)[0]
            # res = r.content.decode(encoding, 'replace')
            return r.content
        except Exception as e:
            self.Write_Logs(str(e))
            print('Failed to fetch: ' + url)
            with connect_mysql() as coon:
                coon.execute('insert into Sec_Fail_Links(url) values (%s)', (url.rstrip('/'),))
    def extract_URL(self, content):
        # Pull URL-looking strings (absolute, relative and bare filenames)
        # out of HTML/JavaScript source.
        pattern_raw = r"""
          (?:"|')                               # Start newline delimiter
          (
            ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
            [^"'/]{1,}\.                        # Match a domainname (any character + dot)
            [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path
            |
            ((?:/|\.\./|\./)                    # Start with /,../,./
            [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
            [^"'><,;|()]{1,})                   # Rest of the characters can't be
            |
            ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
            [a-zA-Z0-9_\-/]{1,}                 # Resource name
            \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
            (?:[\?|/][^"|']{0,}|))              # ? mark with parameters
            |
            ([a-zA-Z0-9_\-]{1,}                 # filename
            \.(?:php|asp|aspx|jsp|json|
                 action|html|js|txt|xml)        # . + extension
            (?:\?[^"|']{0,}|))                  # ? mark with parameters
          )
          (?:"|')                               # End newline delimiter
        """
        pattern = re.compile(pattern_raw, re.VERBOSE)
        js_url = []
        for match in re.finditer(pattern, str(content)):
            candidate = match.group().strip('"').strip("'")
            if candidate not in js_url:
                js_url.append(candidate)
        return js_url
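    # Illustrative example (hypothetical snippet):
    #   self.extract_URL('var u = "/api/list.php?id=1";')  ->  ['/api/list.php?id=1']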
    def process_url(self, base_url, re_URL):
        # Resolve an extracted URL fragment against base_url
        # (the page or script it was found in).
        black_url = ["javascript:"]  # Add some keyword for filter url.
        URL_raw = urlparse(base_url)
        ab_URL = URL_raw.netloc
        host_URL = URL_raw.scheme
        if re_URL[0:2] == "//":
            result = host_URL + ":" + re_URL
        elif re_URL[0:4] == "http":
            result = re_URL
        elif re_URL[0:2] != "//" and re_URL not in black_url:
            if re_URL[0:1] == "/":
                result = host_URL + "://" + ab_URL + re_URL
            else:
                if re_URL[0:1] == ".":
                    if re_URL[0:2] == "..":
                        result = host_URL + "://" + ab_URL + re_URL[2:]
                    else:
                        result = host_URL + "://" + ab_URL + re_URL[1:]
                else:
                    result = host_URL + "://" + ab_URL + "/" + re_URL
        else:
            result = base_url
        return result

    def find_last(self, string, substr):
        # Return every index at which substr occurs in string.
        positions = []
        last_position = -1
        while True:
            position = string.find(substr, last_position + 1)
            if position == -1:
                break
            last_position = position
            positions.append(position)
        return positions
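    # Illustrative examples (hypothetical base URL):
    #   self.process_url('https://example.com/a/', '/list.php?id=1')
    #       -> 'https://example.com/list.php?id=1'
    #   self.process_url('https://example.com/a/', '//cdn.example.com/x.js')
    #       -> 'https://cdn.example.com/x.js'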
    def find_by_url(self, js=False):
        if js == False:
            # Fetch the page, then harvest URLs from every inline and
            # external <script> body, keeping only same-domain results.
            html_raw = self.Request(self.url)
            if html_raw == None:
                # print("Fail to access " + self.url)
                return None
            # print(html_raw)
            html = BeautifulSoup(html_raw, "html.parser")
            html_scripts = html.findAll("script")
            script_array = {}
            script_temp = ""
            for html_script in html_scripts:
                script_src = html_script.get("src")
                if script_src == None:
                    script_temp += html_script.get_text() + "\n"
                else:
                    purl = self.process_url(self.url, script_src)
                    script_array[purl] = self.Request(purl)
            script_array[self.url] = script_temp
            allurls = []
            for script in script_array:
                # print(script)
                temp_urls = self.extract_URL(script_array[script])
                if len(temp_urls) == 0:
                    continue
                for temp_url in temp_urls:
                    allurls.append(self.process_url(script, temp_url))
            result = []
            for singerurl in allurls:
                url_raw = urlparse(self.url)
                domain = url_raw.netloc
                positions = self.find_last(domain, ".")
                main_domain = domain
                if len(positions) > 1:
                    main_domain = domain[positions[-2] + 1:]
                # print(main_domain)
                suburl = urlparse(singerurl)
                subdomain = suburl.netloc
                # print(singerurl)
                if main_domain in subdomain or subdomain.strip() == "":
                    if singerurl.strip() not in result:
                        result.append(singerurl)
            return result
        else:
            temp_urls = self.extract_URL(self.Request(self.url))
            if len(temp_urls) == 0:
                return None
            result = []
            for temp_url in temp_urls:
                if temp_url not in result:
                    result.append(temp_url)
            return result
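    # Get_All_Links() below calls this with js=True, i.e. the lightweight
    # branch that regex-scans the raw page for endpoints without resolving
    # external scripts or filtering by domain.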
    def Get_All_Links(self, url):
        # Fetch a page (retrying once) and return every usable link found in
        # its <a href> attributes plus script-extracted URLs: dynamic links,
        # static pages and directories. Returns None when nothing is found.
        content = self.Request(url)
        if content == None:
            content = self.Request(url)
            if content == None:
                return None
        _links = []
        soup = BeautifulSoup(content, 'html.parser', from_encoding='iso-8859-1')
        links = soup.findAll('a')
        if links != None:
            for link in links:
                _url = link.get('href')
                res = re.search('(javascript|:;|#|%)', str(_url))
                res1 = re.search(r'\.(jpg|png|gif|jpeg|mp4|css|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|w3\.org)', str(_url))
                if res == None and res1 == None:
                    _links.append(str(_url).replace(r'\\', '').rstrip('\\'))
        links2 = self.find_by_url(js=True)
        if links2 != None:
            for link in links2:
                res = re.search('(javascript|:;|#|%)', str(link))
                res1 = re.search(r'\.(jpg|png|gif|jpeg|mp4|css|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|w3\.org)', str(link))
                if res == None and res1 == None:
                    _links.append(str(link).replace(r'\\', '').rstrip('\\'))
        if _links != []:
            return _links
        else:
            return None
    def Get_Dir_Links(self, content):
        # Tidy the raw link list into absolute directory-style URLs on the
        # target domain, sampling when there are too many.
        dir_links = []
        if content != None:
            rst = list(set(content))
            for rurl in rst:
                if rurl.startswith('http') and '://' in rurl and self.domain in rurl:
                    # http://www.baidu.com
                    if rurl.rstrip('/') != self.url:
                        dir_links.append(rurl.strip())
                if 'http' not in rurl and self.domain in rurl:
                    if 'www' in self.url:
                        if 'www' in rurl:
                            dir_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                        else:
                            dir_links.append(self.sche + 'www.' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                    else:
                        dir_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                if 'http' not in rurl and self.domain not in rurl and ':' not in rurl and '//' not in rurl:
                    # /sttd/xhm/
                    dir_links.append(self.sche + self.domain0 + '/' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                if rurl.startswith('://') and 'http' not in rurl and self.domain in rurl:
                    if self.sche + rurl.replace('://', '').rstrip('/') != self.url:
                        dir_links.append(self.sche + rurl.replace('://', ''))
                if rurl.startswith('//') and self.domain in rurl:
                    # //order.jd.com/center/list.action
                    if self.sche + rurl.replace('//', '').rstrip('/') != self.url:
                        dir_links.append(self.sche + rurl.replace('//', ''))
        dir_links = list(set(dir_links))
        html_links = []
        no_html_links = []
        for i in dir_links:
            if 'htm' in i:
                html_links.append(i)
            else:
                no_html_links.append(i)
        if len(html_links) > 30:
            html_links = random.sample(html_links, 10)
        dir_links = html_links + no_html_links
        # print(dir_links)
        if len(dir_links) > 100:
            dir_links = random.sample(dir_links, 60)
        if dir_links != []:
            return dir_links
        else:
            return None
    def Get_Ht_Id_Links(self, content):
        # content is a list of raw links; the result is a dict with two keys:
        # 'ID' (dynamic links with parameters) and 'HT' (static .htm(l) pages).
        id_links = []
        html_links = []
        ht_id_result = {'ID': id_links, 'HT': html_links}
        if content != None:
            rst = list(set(content))
            for rurl in rst:
                if rurl.startswith('http') and '://' in rurl and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # http://www.baidu.com
                    if '?' in rurl and '=' in rurl:
                        # result_links.append(rurl)
                        id_links.append(rurl.strip())
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            # result_links.append(rurl)
                            html_links.append(rurl.strip())
                if 'http' not in rurl and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    if 'www' in self.url:
                        if 'www' in rurl:
                            if '?' in rurl and '=' in rurl:
                                id_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                            if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                                if '?' not in rurl:
                                    # result_links.append(rurl)
                                    html_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                        else:
                            if '?' in rurl and '=' in rurl:
                                id_links.append(self.sche + 'www.' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                            if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                                if '?' not in rurl:
                                    # result_links.append(rurl)
                                    html_links.append(self.sche + 'www.' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                    else:
                        if '?' in rurl and '=' in rurl:
                            id_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                        if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                            if '?' not in rurl:
                                # result_links.append(rurl)
                                html_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                if 'http' not in rurl and self.domain not in rurl and ':' not in rurl and '//' not in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # /sttd/xhm/
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + self.domain0 + '/' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            # result_links.append(rurl)
                            html_links.append(self.sche + self.domain0 + '/' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                if rurl.startswith('://') and 'http' not in rurl and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + rurl.replace('://', ''))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            # result_links.append(rurl)
                            html_links.append(self.sche + rurl.replace('://', ''))
                if rurl.startswith('//') and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # //order.jd.com/center/list.action
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + rurl.replace('//', ''))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            # result_links.append(rurl)
                            html_links.append(self.sche + rurl.replace('//', ''))
                if '//' in rurl and rurl.startswith('http') and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # Both the scheme and the domain are present, e.g.
                    # https://www.yamibuy.com/cn/search.php?tags=163
                    # http://news.hnu.edu.cn/zhyw/2017-11-11/19605.html
                    if '?' in rurl and '=' in rurl:
                        # result_links.append(rurl)
                        id_links.append(rurl.strip())
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            # result_links.append(rurl)
                            html_links.append(rurl.strip())
                # //wmw.dbw.cn/system/2018/09/25/001298805.shtml
                if 'http' not in rurl and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # Scheme missing but the domain is present
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + rurl.lstrip('/').lstrip('.').strip().lstrip('/'))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(self.sche + rurl.lstrip('/').lstrip('.').strip().lstrip('/'))
                # /chanpin/2018-07-12/3.html"
                if 'http' not in rurl and self.domain not in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # Neither the scheme nor the domain is present
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + self.domain0.strip() + '/' + rurl.strip().lstrip('/').lstrip('.').lstrip('/'))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(self.sche + self.domain0.strip() + '/' + rurl.strip().lstrip('/').lstrip('.').lstrip('/'))
        if len(html_links) > 100:
            html_links = random.sample(html_links, 50)
        if len(id_links) > 100:
            id_links = random.sample(id_links, 50)
        ht_id_result['ID'] = list(set(id_links))
        ht_id_result['HT'] = list(set(html_links))
        return ht_id_result
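    # A link can satisfy several of the branches above and be appended more
    # than once; both lists are capped (sampled down to 50 when over 100) and
    # then de-duplicated via set() before being returned.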
    def Filter(self, par1, lis1):
        # Return the first entry of lis1 matching the pattern par1
        # (currently unused by Get_Result).
        for i in lis1:
            try:
                # print('Comparing {}:::::::::::::::::{}'.format(par1.encode(), i.encode()))
                res = re.search(par1.encode(), i.encode())
                if res:
                    # print('Match found!')
                    return i
            except Exception as e:
                # print(e)
                self.Write_Logs(str(e))
    def Get_Result(self):
        Link = self.Get_All_Links(self.url)
        print('-------- First pass on {} ----------'.format(self.url))
        if Link:
            print('All links: {}'.format(len(Link)))
            print(Link)
            self.html_links.extend(self.Get_Ht_Id_Links(Link).get('HT'))
            print('Static links: {}'.format(len(self.html_links)))
            print(self.html_links)
            self.id_links.extend(self.Get_Ht_Id_Links(Link).get('ID'))
            print('Dynamic links: {}'.format(len(self.id_links)))
            print(self.id_links)
            Dirs = self.Get_Dir_Links(Link)
            if Dirs:
                print('Directory links: {}'.format(len(Dirs)))
                print(Dirs)
                for dir in Dirs:
                    Links = self.Get_All_Links(dir)
                    if Links:
                        self.html_links.extend(self.Get_Ht_Id_Links(Links).get('HT'))
                        self.id_links.extend(self.Get_Ht_Id_Links(Links).get('ID'))
        self.html_links = list(set(self.html_links))
        self.id_links = list(set(self.id_links))
        print('-------- Second pass on {} ----------'.format(self.url))
        print('Static links: {}'.format(len(self.html_links)))
        print(self.html_links)
        print('Dynamic links: {}'.format(len(self.id_links)))
        print(self.id_links)
        # Bucket the static pages by path depth and keep a small random sample
        # from each depth; de-duplicate dynamic links by their parameter
        # template so value-only variants are probed once.
        idido = []
        htht = list(set(self.html_links))
        hthtx = []
        dic_1 = []
        dic_2 = []
        dic_3 = []
        dic_4 = []
        for i in htht:
            path = urlparse(i).path
            if path.count('/') == 1:
                dic_1.append(i.replace('.htm', '*.htm').replace('.shtm', '*.shtm'))
            if path.count('/') == 2:
                dic_2.append(i.replace('.htm', '*.htm').replace('.shtm', '*.shtm'))
            if path.count('/') == 3:
                dic_3.append(i.replace('.htm', '*.htm').replace('.shtm', '*.shtm'))
            if path.count('/') > 3:
                dic_4.append(i.replace('.htm', '*.htm').replace('.shtm', '*.shtm'))
        if dic_1:
            hthtx.append(random.choice(dic_1))
            hthtx.append(random.choice(dic_1))
            # hthtx.append(random.choice(dic_1))
        if dic_2:
            hthtx.append(random.choice(dic_2))
            hthtx.append(random.choice(dic_2))
            # hthtx.append(random.choice(dic_2))
        if dic_3:
            hthtx.append(random.choice(dic_3))
            hthtx.append(random.choice(dic_3))
            # hthtx.append(random.choice(dic_3))
        if dic_4:
            hthtx.append(random.choice(dic_4))
            hthtx.append(random.choice(dic_4))
            # hthtx.append(random.choice(dic_4))
        # self.html_links = hthtx
        # print(self.html_links)
        # print('Static page count: {}'.format(len(hthtx)))
        p = filter_url()
        for i in self.id_links:
            mid = str(re.sub('{.*?}', '.*?', str(p.filter_url(i))))
            if mid not in self.mids:
                self.mids.add(mid)
                idido.append(i)
        idido = list(set(idido))
        if len(idido) > 100:
            idido = random.sample(idido, 100)
        print('-------- Third pass on {} ----------'.format(self.url))
        print('Static links: {}'.format(len(hthtx)))
        print(self.html_links)
        print('Dynamic links: {}'.format(len(idido)))
        print(self.id_links)
        # Liveness check: request each sampled link (with a random delay) and
        # keep only those answering 200/301/302; record failures in MySQL.
        ididz = []
        hthtz = []
        for i in idido:
            time.sleep(random.randint(2, 5))
            try:
                r = requests.get(url=i, headers=self.headers, timeout=self.timeout, verify=False)
                # print('Final liveness check URL: {} status: {}'.format(i, r.status_code))
                if r.status_code == 200 or r.status_code == 302 or r.status_code == 301:
                    ididz.append(i.replace('\n', ''))
                    if '?' in r.url and '=' in r.url:
                        ididz.append(r.url.replace('\n', '').replace(';', '').replace(',', '').replace(' ', ''))
            except Exception as e:
                self.Write_Logs(str(e))
                with connect_mysql() as coon:
                    print('Request failed: {}'.format(i))
                    coon.execute('insert into Sec_Fail_Links(url) values (%s)', (i,))
        for i in hthtx:
            time.sleep(random.randint(2, 5))
            try:
                r = requests.get(url=i.replace('*', ''), headers=self.headers, timeout=self.timeout, verify=False)
                # print('Final liveness check URL: {} status: {}'.format(i, r.status_code))
                if r.status_code == 200 or r.status_code == 302 or r.status_code == 301:
                    hthtz.append(i.replace('\n', '').replace(';', '').replace(',', '').replace(' ', ''))
            except Exception as e:
                self.Write_Logs(str(e))
                with connect_mysql() as coon:
                    print('Request failed: {}'.format(i))
                    coon.execute('insert into Sec_Fail_Links(url) values (%s)', (i,))
        # Bucket the surviving dynamic links by path depth, sample a couple
        # from each depth, and print what would go into the database.
        ididt = list(set(ididz))
        ididx = []
        dic_11 = []
        dic_21 = []
        dic_31 = []
        dic_41 = []
        for i in ididt:
            path = urlparse(i).path
            if path.count('/') == 1:
                dic_11.append(i)
            if path.count('/') == 2:
                dic_21.append(i)
            if path.count('/') == 3:
                dic_31.append(i)
            if path.count('/') > 3:
                dic_41.append(i)
        if dic_11:
            ididx.append(random.choice(dic_11))
            ididx.append(random.choice(dic_11))
            # ididx.append(random.choice(dic_11))
        if dic_21:
            ididx.append(random.choice(dic_21))
            ididx.append(random.choice(dic_21))
            # ididx.append(random.choice(dic_21))
        if dic_31:
            ididx.append(random.choice(dic_31))
            ididx.append(random.choice(dic_31))
            # ididx.append(random.choice(dic_31))
        if dic_41:
            ididx.append(random.choice(dic_41))
            ididx.append(random.choice(dic_41))
            # ididx.append(random.choice(dic_41))
        print('DB static links : ' + self.url + ':' + str(list(set(hthtz))) + '\n')
        print('DB dynamic links : ' + self.url + ':' + str(ididx) + '\n')
        print('DB dynamic links : ' + self.url + ':' + str(list(set(ididz))) + '\n')
        # return result_links
        # return self.result
if __name__ == '__main__':
    url = 'https://www.jd.com'
    link = Get_Links(url)
    link.Get_Result()
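# The script above only ever INSERTs into Sec_Fail_Links and treats MySQL
# error 1062 (duplicate key) as "already recorded". The schema itself is not
# part of this file; the helper below is a minimal sketch of a compatible
# table (assumed layout: an auto-increment id plus a unique url column) and is
# never called by the script.
def create_fail_links_table():
    # Assumption: a VARCHAR(255) url with a UNIQUE constraint matches the
    # duplicate-entry handling in connect_mysql(); adjust sizes as needed.
    with connect_mysql() as cursor:
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS Sec_Fail_Links ('
            'id INT AUTO_INCREMENT PRIMARY KEY, '
            'url VARCHAR(255) NOT NULL UNIQUE'
            ') DEFAULT CHARSET=utf8'
        )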