# -*- coding:utf-8 -*-
# __author__: langzi
# __blog__: www.langzi.fun
import contextlib
import pymysql
import re
import time
import requests
requests.packages.urllib3.disable_warnings()
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import random
from concurrent.futures import ThreadPoolExecutor


# MySQL connection settings
user = 'root'
passwd = 'root'
host = '127.0.0.1'
Dbname = 'langzi_scan_1'
port = 3306  # pymysql expects an int here, not the string '3306'

# Thread count and scan depth (not used directly in this module)
thread_s = 16
scan_level_s = 1
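
# The failure logging below (inserts into Sec_Fail_Links) assumes a table roughly
# like this already exists in langzi_scan_1 -- a sketch only, the real schema may
# differ; the UNIQUE constraint matches the duplicate-entry (1062) handling in
# connect_mysql():
#
#   CREATE TABLE IF NOT EXISTS Sec_Fail_Links (
#       id  INT AUTO_INCREMENT PRIMARY KEY,
#       url VARCHAR(512) UNIQUE
#   );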


@contextlib.contextmanager
def connect_mysql():
    # Yield a cursor; commit and clean up on exit. Duplicate-entry (1062) errors
    # are reported, other exceptions are silently swallowed.
    coon = pymysql.connect(user=user, passwd=passwd, host=host, db=Dbname, port=port, charset='utf8')
    cursor = coon.cursor()
    try:
        yield cursor
    except Exception as e:
        if '1062, "Duplicate entry ' in str(e):
            print('Duplicate URL, skipping')
    finally:
        coon.commit()
        cursor.close()
        coon.close()
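

# Illustrative use of connect_mysql() (Sec_Fail_Links is the table this script
# writes failed URLs into; the parameterised form shown here is a safer
# alternative to the string formatting used further below):
#
#     with connect_mysql() as cursor:
#         cursor.execute('insert into Sec_Fail_Links(url) values (%s)', ('http://example.com',))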


import urllib.parse, os.path, re


class filter_url:
    # Normalize URLs into patterns such as {int:3} / {str:5} so that links with
    # the same structure (same path layout, same parameter types and lengths)
    # can be deduplicated.
    def __init__(self):
        self.list_url_static = []

    def filter_url(self, url):
        url = urllib.parse.urlparse(url)
        if url.query != '':
            return self.params_filter(url)
        else:
            return self.static_filter(url)

    def static_filter(self, url):
        # Handle pseudo-static URLs and plain path segments
        urls = os.path.splitext(url.path)
        if urls[1] != '':
            list_url = []
            for i in urls[0].split('/'):
                if i != '':
                    list_url.append('{%s:%s}' % (self.judgetype(i), len(i)))
            url_path = "/".join(list_url)
            return url.scheme + '://' + url.netloc + '/' + url_path + urls[1]
        else:
            list_url = []
            for i in url.path.split('/'):
                if i != '':
                    list_url.append('{%s:%s}' % (self.judgetype(i), len(i)))
            url_path = "/".join(list_url)
            return url.scheme + '://' + url.netloc + '/' + url_path

    def params_filter(self, url):
        # Handle URL query parameters
        try:
            liststr = []
            for i in url.query.split('&'):
                para = i.split('=')
                length_int = len(para[1])
                if self.judgetype(para[1]) == 'int':
                    para[1] = '{int:%s}' % length_int
                else:
                    para[1] = '{str:%s}' % length_int
                para = '='.join(para)
                liststr.append(para)
            url_paras = '&'.join(liststr)
            return url.scheme + '://' + url.netloc + url.path + '?' + url_paras
        except Exception:
            # Query string without key=value pairs: type the whole query instead
            length_int = len(url.query)
            url_paras = '{' + self.judgetype(url.query) + ':%s}' % length_int
            return url.scheme + '://' + url.netloc + url.path + '?' + url_paras

    def callback_content(self, content):
        return re.split(r'-|_|\.', content)

    def judgetype(self, strs):
        try:
            int(strs)
            return 'int'
        except (ValueError, TypeError):
            return 'str'
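

# Illustrative examples of the normalisation filter_url performs (values are
# hypothetical):
#
#     p = filter_url()
#     p.filter_url('http://example.com/news.php?id=123&cat=abc')
#     # -> 'http://example.com/news.php?id={int:3}&cat={str:3}'
#     p.filter_url('http://example.com/2018/09/25/001298805.shtml')
#     # -> 'http://example.com/{int:4}/{int:2}/{int:2}/{int:9}.shtml'
#
# Get_Result() later substitutes '.*?' for each {type:length} placeholder so that
# structurally identical dynamic links are kept only once.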


headerss = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]


class Get_Links:
    def __init__(self, url):
        self.url = url
        self.headers = {'User-Agent': random.choice(headerss)}
        self.timeout = 15
        # netloc of the start URL; self.domain is used for same-site filtering
        self.domain0 = urlparse(self.url).netloc.replace('/', '')
        self.domain = self.domain0

        self.result = {}
        self.all_links = []   # every link collected on the first pass
        self.id_links = []    # dynamic links carrying query parameters
        self.html_links = []  # static .htm/.html/.shtml pages

        if 'http://' in self.url:
            self.sche = 'http://'
        else:
            self.sche = 'https://'
        self.mids = set()

    def Write_Logs(self, content):
        # Append a timestamped entry (string) to log.txt
        with open('log.txt', 'a+') as aa:
            aa.write('*********************************************' + '\n')
            aa.write(str(time.strftime('%Y-%m-%d:%H:%M:%S ', time.localtime())) + str(content) + '\n')

    def Request(self, url):
        # Fetch a URL and return the raw (undecoded) body; on failure, log the
        # error, record the URL in Sec_Fail_Links and return None.
        try:
            time.sleep(random.randint(1, 5))
            r = requests.get(url=url, headers=self.headers, timeout=self.timeout, verify=False)
            return r.content
        except Exception as e:
            self.Write_Logs(str(e))
            print('Failed to fetch: ' + url)
            with connect_mysql() as coon:
                sql1 = 'insert into Sec_Fail_Links(url) values ("{}")'.format(url.rstrip('/'))
                coon.execute(sql1)
            return None

    def extract_URL(self, content):
        # Regex-based endpoint extraction (LinkFinder-style) from HTML/JS content
        pattern_raw = r"""
          (?:"|')                               # Start newline delimiter
          (
            ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
            [^"'/]{1,}\.                        # Match a domainname (any character + dot)
            [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path
            |
            ((?:/|\.\./|\./)                    # Start with /,../,./
            [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
            [^"'><,;|()]{1,})                   # Rest of the characters can't be
            |
            ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
            [a-zA-Z0-9_\-/]{1,}                 # Resource name
            \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
            (?:[\?|/][^"|']{0,}|))              # ? mark with parameters
            |
            ([a-zA-Z0-9_\-]{1,}                 # filename
            \.(?:php|asp|aspx|jsp|json|
            action|html|js|txt|xml)             # . + extension
            (?:\?[^"|']{0,}|))                  # ? mark with parameters
          )
          (?:"|')                               # End newline delimiter
        """
        pattern = re.compile(pattern_raw, re.VERBOSE)
        result = re.finditer(pattern, str(content))
        if result is None:
            return None
        js_url = []
        for match in result:
            found = match.group().strip('"').strip("'")
            if found not in js_url:
                js_url.append(found)
        return js_url
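
    # Illustrative output of extract_URL (hypothetical content): given page or
    # script text containing '"/user/list.php?id=1"' and "'//cdn.example.com/app.js'",
    # it returns ['/user/list.php?id=1', '//cdn.example.com/app.js'] -- quoted
    # absolute, protocol-relative and relative endpoints, which process_url()
    # below resolves against the page that referenced them.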

    def process_url(self, base_url, re_URL):
        # Resolve an extracted endpoint against the URL of the page or script
        # that referenced it.
        black_url = ["javascript:"]  # Add some keyword for filter url.
        URL_raw = urlparse(base_url)
        ab_URL = URL_raw.netloc
        host_URL = URL_raw.scheme
        if re_URL[0:2] == "//":
            result = host_URL + ":" + re_URL
        elif re_URL[0:4] == "http":
            result = re_URL
        elif re_URL[0:2] != "//" and re_URL not in black_url:
            if re_URL[0:1] == "/":
                result = host_URL + "://" + ab_URL + re_URL
            else:
                if re_URL[0:1] == ".":
                    if re_URL[0:2] == "..":
                        result = host_URL + "://" + ab_URL + re_URL[2:]
                    else:
                        result = host_URL + "://" + ab_URL + re_URL[1:]
                else:
                    result = host_URL + "://" + ab_URL + "/" + re_URL
        else:
            result = base_url
        return result
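
    # Illustrative resolutions performed by process_url (hypothetical values),
    # for a Get_Links instance g and base = 'https://example.com/a/':
    #
    #     g.process_url(base, '//img.example.com/x.js')   # -> 'https://img.example.com/x.js'
    #     g.process_url(base, '/user/list.php?id=1')      # -> 'https://example.com/user/list.php?id=1'
    #     g.process_url(base, 'detail.html')              # -> 'https://example.com/detail.html'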

    def find_last(self, string, substr):
        # Return the positions of every occurrence of substr in string
        positions = []
        last_position = -1
        while True:
            position = string.find(substr, last_position + 1)
            if position == -1:
                break
            last_position = position
            positions.append(position)
        return positions

    def find_by_url(self, js=False):
        # js=False: parse the page, collect inline and external <script> content
        # and keep only same-domain endpoints extracted from it.
        # js=True: run the regex extraction directly over the raw page content.
        if js == False:
            html_raw = self.Request(self.url)
            if html_raw is None:
                return None
            html = BeautifulSoup(html_raw, "html.parser")
            html_scripts = html.findAll("script")
            script_array = {}
            script_temp = ""
            for html_script in html_scripts:
                script_src = html_script.get("src")
                if script_src is None:
                    script_temp += html_script.get_text() + "\n"
                else:
                    purl = self.process_url(self.url, script_src)
                    script_array[purl] = self.Request(purl)
            script_array[self.url] = script_temp
            allurls = []
            for script in script_array:
                temp_urls = self.extract_URL(script_array[script])
                if len(temp_urls) == 0:
                    continue
                for temp_url in temp_urls:
                    allurls.append(self.process_url(script, temp_url))
            result = []
            for singerurl in allurls:
                url_raw = urlparse(self.url)
                domain = url_raw.netloc
                positions = self.find_last(domain, ".")
                maindomain = domain
                if len(positions) > 1:
                    maindomain = domain[positions[-2] + 1:]
                suburl = urlparse(singerurl)
                subdomain = suburl.netloc
                # Keep links on the same main domain, or with no host at all
                if maindomain in subdomain or subdomain.strip() == "":
                    if singerurl.strip() not in result:
                        result.append(singerurl)
            return result
        else:
            temp_urls = self.extract_URL(self.Request(self.url))
            if len(temp_urls) == 0:
                return None
            result = []
            for temp_url in temp_urls:
                if temp_url not in result:
                    result.append(temp_url)
            return result

    def Get_All_Links(self, url):
        # Fetch a page and return every usable link found on it (dynamic links,
        # static pages and directories), or None if nothing was found.
        content = self.Request(url)
        if content is None:
            content = self.Request(url)  # one retry
            if content is None:
                return None
        _links = []
        soup = BeautifulSoup(content, 'html.parser', from_encoding='iso-8859-1')
        links = soup.findAll('a')
        if links is not None:
            for link in links:
                _url = link.get('href')
                res = re.search('(javascript|:;|#|%)', str(_url))
                res1 = re.search(r'.(jpg|png|gif|jpeg|mp4|css|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|\.w3\.org)', str(_url))
                if res is None and res1 is None:
                    _links.append(str(_url).replace(r'\\', '').rstrip('\\'))

        # js=True: regex-extract endpoints straight from the start page content
        links2 = self.find_by_url(js=True)
        if links2 is not None:
            for link in links2:
                res = re.search('(javascript|:;|#|%)', str(link))
                res1 = re.search(r'.(jpg|png|gif|jpeg|mp4|css|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|\.w3\.org)', str(link))
                if res is None and res1 is None:
                    _links.append(str(link).replace(r'\\', '').rstrip('\\'))
        if _links != []:
            return _links
        else:
            return None

    def Get_Dir_Links(self, content):
        # Turn the raw link list from Get_All_Links into absolute, same-domain URLs
        dir_links = []
        if content is not None:
            rst = list(set(content))
            for rurl in rst:
                if rurl.startswith('http') and '://' in rurl and self.domain in rurl:
                    # e.g. http://www.baidu.com
                    if rurl.rstrip('/') != self.url:
                        dir_links.append(rurl.strip())

                if 'http' not in rurl and self.domain in rurl:
                    if 'www' in self.url:
                        if 'www' in rurl:
                            dir_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                        else:
                            dir_links.append(self.sche + 'www.' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                    else:
                        dir_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))

                if 'http' not in rurl and self.domain not in rurl and ':' not in rurl and '//' not in rurl:
                    # e.g. /sttd/xhm/
                    dir_links.append(self.sche + self.domain0 + '/' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))

                if rurl.startswith('://') and 'http' not in rurl and self.domain in rurl:
                    if self.sche + rurl.replace('://', '').rstrip('/') != self.url:
                        dir_links.append(self.sche + rurl.replace('://', ''))

                if rurl.startswith('//') and self.domain in rurl:
                    # e.g. //order.jd.com/center/list.action
                    if self.sche + rurl.replace('//', '').rstrip('/') != self.url:
                        dir_links.append(self.sche + rurl.replace('//', ''))

        dir_links = list(set(dir_links))
        # Down-sample: if there are more than 30 .htm* pages keep 10 of them,
        # and cap the total at 60 when it exceeds 100
        html_links = []
        no_html_links = []
        for i in dir_links:
            if 'htm' in i:
                html_links.append(i)
            else:
                no_html_links.append(i)
        if len(html_links) > 30:
            html_links = random.sample(html_links, 10)
        dir_links = html_links + no_html_links
        if len(dir_links) > 100:
            dir_links = random.sample(dir_links, 60)
        if dir_links != []:
            return dir_links
        else:
            return None
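
    # Illustrative behaviour of Get_Dir_Links (hypothetical inputs, with
    # self.url = 'https://www.example.com' and self.domain = 'www.example.com'):
    #
    #     ['https://www.example.com/news/',        # absolute same-domain link, kept as-is
    #      '//www.example.com/bbs/index.html',     # protocol-relative, scheme prepended
    #      '/sttd/xhm/']                           # root-relative, domain0 prepended
    #
    # returns (in some order):
    #
    #     ['https://www.example.com/news/',
    #      'https://www.example.com/bbs/index.html',
    #      'https://www.example.com/sttd/xhm']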

    def Get_Ht_Id_Links(self, content):
        # content is a list of links; returns a dict with the dynamic links under
        # 'ID' and the static .htm/.html/.shtml pages under 'HT'.
        id_links = []
        html_links = []
        ht_id_result = {'ID': id_links, 'HT': html_links}
        if content is not None:
            rst = list(set(content))
            for rurl in rst:
                if rurl.startswith('http') and '://' in rurl and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # e.g. http://www.baidu.com
                    if '?' in rurl and '=' in rurl:
                        id_links.append(rurl.strip())
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(rurl.strip())

                if 'http' not in rurl and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    if 'www' in self.url:
                        if 'www' in rurl:
                            if '?' in rurl and '=' in rurl:
                                id_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                            if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                                if '?' not in rurl:
                                    html_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                        else:
                            if '?' in rurl and '=' in rurl:
                                id_links.append(self.sche + 'www.' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                            if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                                if '?' not in rurl:
                                    html_links.append(self.sche + 'www.' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                    else:
                        if '?' in rurl and '=' in rurl:
                            id_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                        if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                            if '?' not in rurl:
                                html_links.append(self.sche + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))

                if 'http' not in rurl and self.domain not in rurl and ':' not in rurl and '//' not in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # e.g. /sttd/xhm/
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + self.domain0 + '/' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(self.sche + self.domain0 + '/' + rurl.lstrip('/').lstrip('.').rstrip('/').rstrip('.').replace('//', '').replace(':', ''))

                if rurl.startswith('://') and 'http' not in rurl and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + rurl.replace('://', ''))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(self.sche + rurl.replace('://', ''))

                if rurl.startswith('//') and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # e.g. //order.jd.com/center/list.action
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + rurl.replace('//', ''))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(self.sche + rurl.replace('//', ''))

                if '//' in rurl and rurl.startswith('http') and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # scheme and domain both present, e.g.
                    # https://www.yamibuy.com/cn/search.php?tags=163
                    # http://news.hnu.edu.cn/zhyw/2017-11-11/19605.html
                    if '?' in rurl and '=' in rurl:
                        id_links.append(rurl.strip())
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(rurl.strip())

                if 'http' not in rurl and self.domain in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # no scheme but the domain is present,
                    # e.g. //www.dbw.cn/system/2018/09/25/001298805.shtml
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + rurl.lstrip('/').lstrip('.').strip().lstrip('/'))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(self.sche + rurl.lstrip('/').lstrip('.').strip().lstrip('/'))

                if 'http' not in rurl and self.domain not in rurl and '.js?' not in rurl and '.min.js' not in rurl:
                    # neither scheme nor domain present, e.g. /chanpin/2018-07-12/3.html
                    if '?' in rurl and '=' in rurl:
                        id_links.append(self.sche + self.domain0.strip() + '/' + rurl.strip().lstrip('/').lstrip('.').lstrip('/'))
                    if '.html' in rurl or '.shtml' in rurl or '.htm' in rurl or '.shtm' in rurl:
                        if '?' not in rurl:
                            html_links.append(self.sche + self.domain0.strip() + '/' + rurl.strip().lstrip('/').lstrip('.').lstrip('/'))

        if len(html_links) > 100:
            html_links = random.sample(html_links, 50)
        if len(id_links) > 100:
            id_links = random.sample(id_links, 50)

        ht_id_result['ID'] = list(set(id_links))
        ht_id_result['HT'] = list(set(html_links))
        return ht_id_result
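
    # The dict returned by Get_Ht_Id_Links looks like this (hypothetical values):
    #
    #     {'ID': ['http://www.example.com/news.php?id=3'],        # dynamic, parameterised links
    #      'HT': ['http://www.example.com/2018/09/25/1.shtml']}   # static pages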

    def Filter(self, par1, lis1):
        # Return the first entry of lis1 that matches the pattern par1
        for i in lis1:
            try:
                res = re.search(par1.encode(), i.encode())
                if res:
                    return i
            except Exception as e:
                self.Write_Logs(str(e))

    def Get_Result(self):
        # Crawl self.url, collect static and dynamic links, check that a sample of
        # them is alive and print what would be written to the database.
        Link = self.Get_All_Links(self.url)
        print('-------- First pass for {} ----------'.format(self.url))
        if Link:
            print('All links: {}'.format(len(Link)))
            print(Link)
            ht_id = self.Get_Ht_Id_Links(Link)
            self.html_links.extend(ht_id.get('HT'))
            print('Static links: {}'.format(len(self.html_links)))
            print(self.html_links)
            self.id_links.extend(ht_id.get('ID'))
            print('Dynamic links: {}'.format(len(self.id_links)))
            print(self.id_links)
            Dirs = self.Get_Dir_Links(Link)
            if Dirs:
                print('Directory links: {}'.format(len(Dirs)))
                print(Dirs)
                for d in Dirs:
                    Links = self.Get_All_Links(d)
                    if Links:
                        ht_id2 = self.Get_Ht_Id_Links(Links)
                        self.html_links.extend(ht_id2.get('HT'))
                        self.id_links.extend(ht_id2.get('ID'))
        self.html_links = list(set(self.html_links))
        self.id_links = list(set(self.id_links))
        print('-------- Second pass for {} ----------'.format(self.url))
        print('Static links: {}'.format(len(self.html_links)))
        print(self.html_links)
        print('Dynamic links: {}'.format(len(self.id_links)))
        print(self.id_links)

        # Pick two random static pages from each path depth and tag the extension
        # with '*' (the '*' is stripped again before the page is requested below)
        idido = []
        htht = list(set(self.html_links))
        hthtx = []
        dic_1 = []
        dic_2 = []
        dic_3 = []
        dic_4 = []
        for i in htht:
            path = urlparse(i).path
            if path.count('/') == 1:
                dic_1.append(i.replace('.htm', '*.htm').replace('.shtm', '*.shtm'))
            if path.count('/') == 2:
                dic_2.append(i.replace('.htm', '*.htm').replace('.shtm', '*.shtm'))
            if path.count('/') == 3:
                dic_3.append(i.replace('.htm', '*.htm').replace('.shtm', '*.shtm'))
            if path.count('/') > 3:
                dic_4.append(i.replace('.htm', '*.htm').replace('.shtm', '*.shtm'))
        if dic_1:
            hthtx.append(random.choice(dic_1))
            hthtx.append(random.choice(dic_1))
        if dic_2:
            hthtx.append(random.choice(dic_2))
            hthtx.append(random.choice(dic_2))
        if dic_3:
            hthtx.append(random.choice(dic_3))
            hthtx.append(random.choice(dic_3))
        if dic_4:
            hthtx.append(random.choice(dic_4))
            hthtx.append(random.choice(dic_4))

        # Deduplicate dynamic links by their normalized pattern (see filter_url)
        p = filter_url()
        for i in self.id_links:
            mid = str(re.sub('{.*?}', '.*?', str(p.filter_url(i))))
            if mid in self.mids:
                pass
            else:
                self.mids.add(mid)
                idido.append(i)
        idido = list(set(idido))
        if len(idido) > 100:
            idido = random.sample(idido, 100)

        print('-------- Third pass for {} ----------'.format(self.url))
        print('Static links: {}'.format(len(hthtx)))
        print(hthtx)
        print('Dynamic links: {}'.format(len(idido)))
        print(idido)

        # Liveness check: keep only links answering with 200/301/302 and record
        # the ones that fail in Sec_Fail_Links
        ididz = []
        hthtz = []
        for i in idido:
            time.sleep(random.randint(2, 5))
            try:
                r = requests.get(url=i, headers=self.headers, timeout=self.timeout, verify=False)
                if r.status_code == 200 or r.status_code == 302 or r.status_code == 301:
                    ididz.append(i.replace('\n', ''))
                    if '?' in r.url and '=' in r.url:
                        ididz.append(r.url.replace('\n', '').replace(';', '').replace(',', '').replace(' ', ''))
            except Exception as e:
                self.Write_Logs(str(e))
                with connect_mysql() as coon:
                    print('Request failed: {}'.format(i))
                    sql1 = 'insert into Sec_Fail_Links(url) values ("{}")'.format(i)
                    coon.execute(sql1)

        for i in hthtx:
            time.sleep(random.randint(2, 5))
            try:
                r = requests.get(url=i.replace('*', ''), headers=self.headers, timeout=self.timeout, verify=False)
                if r.status_code == 200 or r.status_code == 302 or r.status_code == 301:
                    hthtz.append(i.replace('\n', '').replace(';', '').replace(',', '').replace(' ', ''))
            except Exception as e:
                self.Write_Logs(str(e))
                with connect_mysql() as coon:
                    print('Request failed: {}'.format(i))
                    sql1 = 'insert into Sec_Fail_Links(url) values ("{}")'.format(i)
                    coon.execute(sql1)

        # Sample the surviving dynamic links by path depth, as above
        ididt = list(set(ididz))
        ididx = []
        dic_11 = []
        dic_21 = []
        dic_31 = []
        dic_41 = []
        for i in ididt:
            path = urlparse(i).path
            if path.count('/') == 1:
                dic_11.append(i)
            if path.count('/') == 2:
                dic_21.append(i)
            if path.count('/') == 3:
                dic_31.append(i)
            if path.count('/') > 3:
                dic_41.append(i)
        if dic_11:
            ididx.append(random.choice(dic_11))
            ididx.append(random.choice(dic_11))
        if dic_21:
            ididx.append(random.choice(dic_21))
            ididx.append(random.choice(dic_21))
        if dic_31:
            ididx.append(random.choice(dic_31))
            ididx.append(random.choice(dic_31))
        if dic_41:
            ididx.append(random.choice(dic_41))
            ididx.append(random.choice(dic_41))

        print('DB static links : ' + self.url + ':' + str(list(set(hthtz))) + '\n')
        print('DB dynamic links (sampled) : ' + self.url + ':' + str(ididx) + '\n')
        print('DB dynamic links (all) : ' + self.url + ':' + str(list(set(ididz))) + '\n')


if __name__ == '__main__':
    url = 'https://www.jd.com'
    link = Get_Links(url)
    link.Get_Result()