Security_Code/COMMON_CODE半成品-不再更新/下载/爬取动态超链接.py

# coding:utf8
import requests
import re
import threading
import lxml.html
import urlparse
import time
from Queue import Queue
import sqlite3
import vul_module
import urllib2
import chardet
from config import *
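# NOTE: `from config import *` is expected to supply the get_ctime(),
# IGNORE_EXT, SIMILAR_SET, QUEUE and TOTAL_URL names used below; the config
# module itself is not part of this file.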
class SpiderThread(threading.Thread):
    """Crawls one page, extracts in-scope links and feeds them back into the queue."""

    def __init__(self, target, url, logfile, module):
        threading.Thread.__init__(self)
        self.target = target
        self.deep_url = url    # [depth, url] pair taken from the crawl queue
        self.logfile = logfile
        self.module = module   # name of the vulnerability check to run on new URLs
    def GetLinks(self, host, html):
        """Parse the page and return a de-duplicated list of absolute links."""
        link_list = []
        try:
            tmp = lxml.html.document_fromstring(html)
            tmp.make_links_absolute(host)  # resolve relative hrefs against the target
            links = tmp.iterlinks()        # yields (element, attribute, link, pos) tuples
            link_list = list(set([i[2] for i in links]))
        except Exception, e:
            print get_ctime() + '\tXml error:', str(e)
            self.logfile.write(get_ctime() + '\tXml error:' + str(e) + '\turl:' + self.deep_url[1] + '\n')
            self.logfile.flush()
        return link_list
    def SpiderPage(self):
        """Fetch the current URL and return its links, minus ignored file extensions."""
        link_tmp = []
        try:
            html = urllib2.urlopen(self.deep_url[1], timeout=10).read().lower()
            if chardet.detect(html)['encoding'] == 'GB2312':
                html = html.decode('gb2312').encode('utf8')
            link_list = self.GetLinks(self.target, html)
            for i in link_list:
                # keep the link only if its extension is not blacklisted in config
                ext = urlparse.urlparse(i)[2].split('.')[-1]
                if ext not in IGNORE_EXT:
                    link_tmp.append(i)
        except Exception, e:
            print get_ctime() + '\tHttp error:', str(e)
            self.logfile.write(get_ctime() + '\tHttp error:' + str(e) + '\turl:' + self.deep_url[1] + '\n')
            self.logfile.flush()
        link_list = link_tmp
        return link_list
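    # NOTE (assumption): chardet frequently labels GBK pages as 'GB2312';
    # since GBK is a superset of GB2312, decoding with 'gbk' here would be
    # the more forgiving choice if mojibake shows up.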
    def url_similar_check(self, url):
        '''
        URL similarity analysis:
        treat two URLs as duplicates when they share the same path and the
        same (sorted) set of query parameter names.
        '''
        global SIMILAR_SET
        url_struct = urlparse.urlparse(url)
        query_key = '|'.join(sorted([i.split('=')[0] for i in url_struct.query.split('&')]))
        url_hash = hash(url_struct.path + query_key)
        if url_hash not in SIMILAR_SET:
            SIMILAR_SET.add(url_hash)
            return True
        return False
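    # Example: '/list.php?page=2&sort=asc' and '/list.php?sort=desc&page=9'
    # hash to the same value (same path, same sorted key set 'page|sort'),
    # so only the first variant encountered is crawled.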
    def run(self):
        global QUEUE
        global TOTAL_URL
        tmp_link_list = self.SpiderPage()
        pre_url_list = TOTAL_URL
        # keep only links that stay under the target domain
        link_list = []
        for i in tmp_link_list:
            if urlparse.urlparse(self.target).netloc == urlparse.urlparse(i).netloc:
                link_list.append(i)
        TOTAL_URL = TOTAL_URL | set(link_list)
        new_url_list = list(TOTAL_URL - pre_url_list)
        # attach depth information and run the URL similarity check
        depth = self.deep_url[0] + 1
        for i in range(len(new_url_list)):
            if self.url_similar_check(new_url_list[i]):
                print get_ctime() + '\tCrawl url:' + new_url_list[i]
                vul_module.vul_module(new_url_list[i], self.logfile).check(self.module)
                self.logfile.write(get_ctime() + '\tCrawl url:' + new_url_list[i] + ',depth:' + str(depth) + '\n')
                self.logfile.flush()
                QUEUE.append([depth, new_url_list[i]])
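
# --- Usage sketch (assumption; not in the original, half-finished file) ---
# A minimal single-threaded driver, assuming config defines QUEUE as a list of
# [depth, url] pairs and TOTAL_URL as a set (consistent with how they are used
# above). MAX_DEPTH and the 'all' module name are hypothetical placeholders.
if __name__ == '__main__':
    MAX_DEPTH = 3                              # hypothetical cutoff, not from config
    target = 'http://example.com/'             # placeholder target
    logfile = open('spider.log', 'a')
    QUEUE.append([0, target])
    TOTAL_URL = TOTAL_URL | set([target])
    while QUEUE:
        deep_url = QUEUE.pop(0)                # breadth-first: oldest entry first
        if deep_url[0] > MAX_DEPTH:
            continue
        thread = SpiderThread(target, deep_url, logfile, 'all')
        thread.start()
        thread.join()                          # serialized here for simplicity
    logfile.close()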