# Security_Code/COMMON_CODE半成品-不再更新/URL去重处理.py
#
# 76 lines
# 2.9 KiB
# Python

# -*- coding:utf-8 -*-
#__author__:langzi
#__blog__:www.langzi.fun
# coding: utf-8
'''
Created on 2018年7月1日
@author: guimaizi
'''
import urllib.parse,os.path,re
class filter_url:
    """Reduce URLs to structural templates so similar URLs compare equal.

    Variable parts of a URL are replaced by ``{type:length}`` placeholders,
    where ``type`` is ``int`` or ``str`` (see :meth:`judgetype`) and
    ``length`` is the character count of the replaced text.  Two URLs that
    differ only in such values therefore yield the same template, e.g.
    ``/zxft/20483.htm`` and ``/zxft/31231.htm`` both become
    ``/{str:4}/{int:5}.htm``.
    """

    def __init__(self):
        # Not read or written by any method in this class; kept so that
        # external code accessing the attribute keeps working.
        self.list_url_static = []

    def filter_url(self, url):
        """Print and return the template for *url*.

        URLs with a query string are handled by :meth:`params_filter`
        (only the query values are templated); all others are handled by
        :meth:`static_filter` (the path segments are templated).

        :param url: URL string to normalize.
        :return: the template string that was printed.
        """
        parsed = urllib.parse.urlparse(url)
        if parsed.query != '':
            template = self.params_filter(parsed)
            print(template)
            return template
        # No query string.  An empty path needs no special case:
        # static_filter() yields "scheme://netloc/" for it.
        return self.static_filter(parsed)

    def static_filter(self, url):
        """Print and return the template for a URL without a query string.

        Handles plain paths and pseudo-static URLs (paths ending in a file
        extension).  Every non-empty path segment becomes a
        ``{type:length}`` placeholder; a file extension on the last
        segment is kept verbatim.  Empty segments (leading, trailing or
        doubled slashes) are dropped, so a trailing ``/`` does not appear
        in the result.

        :param url: result of ``urllib.parse.urlparse``.
        :return: the template string that was printed.
        """
        # splitext() returns (path, '') when there is no extension, so a
        # single code path covers both the pseudo-static and plain cases.
        root, ext = os.path.splitext(url.path)
        segments = [
            '{%s:%s}' % (self.judgetype(part), len(part))
            for part in root.split('/')
            if part != ''
        ]
        template = (url.scheme + '://' + url.netloc + '/'
                    + '/'.join(segments) + ext)
        print(template)
        return template

    def params_filter(self, url):
        """Return the template for a URL that has a query string.

        Each ``key=value`` pair keeps its key and has the value replaced
        by a ``{type:length}`` placeholder.  If any ``&``-separated item
        has no ``=`` (e.g. ``?dsdsa`` or a trailing ``&``), the whole
        query string is replaced by one placeholder instead.  The path is
        left untouched.

        :param url: result of ``urllib.parse.urlparse``.
        :return: the template string.
        """
        # partition() splits on the first '=' only, so a value that itself
        # contains '=' (e.g. base64 padding) is templated as a whole rather
        # than leaking its tail into the result.
        pairs = [item.partition('=') for item in url.query.split('&')]
        if all(sep for _key, sep, _value in pairs):
            query = '&'.join(
                '%s={%s:%s}' % (key, self.judgetype(value), len(value))
                for key, _sep, value in pairs
            )
        else:
            # At least one item is not key=value: template the raw query.
            query = '{%s:%s}' % (self.judgetype(url.query), len(url.query))
        return url.scheme + '://' + url.netloc + url.path + '?' + query

    def callback_content(self, content):
        """Split *content* on ``-``, ``_`` and ``.`` and return the pieces.

        :param content: string to split.
        :return: list of substrings between the separators.
        """
        return re.split(r'-|_|\.', content)

    def judgetype(self, strs):
        """Classify *strs* as ``'int'`` or ``'str'``.

        :param strs: value to classify.
        :return: ``'int'`` if ``int(strs)`` succeeds, otherwise ``'str'``.
        """
        try:
            int(strs)
        except (TypeError, ValueError):
            return 'str'
        return 'int'
if __name__ == '__main__':
    # Demo run: feed a fixed set of sample URLs (with and without query
    # strings, file extensions and trailing slashes) through the filter,
    # which prints the template computed for each one.
    sample_urls = [
        'http://www.target.cn/zxft/20483.htm?dsdsa',
        'http://www.target.cn/zxft.php',
        'http://www.target.cn/zxft/20483.htm',
        'http://www.target.cn/zxft/20483.htm?dsdsa=dsadsa&dada=1',
        'http://www.target.cn/zxft/31231.htm',
        'http://www.target.cn/zxft/31231',
        'http://www.target.cn/',
        'http://www.target.cn/zxft/20483.htm?dsdsa=ds1adsa&dada=231231',
        'http://www.target.cn/dsadsa/',
        'http://www.target.cn/2131',
        'http://www.target.cn/user',
        'http://www.target.cn/da1s_dasd/',
        'http://www.target.cn/das_dasd?das=121',
        'http://www.target.cn/index.php/thanks',
        'http://www.target.cn/?a=dasd',
    ]
    url_filter = filter_url()
    for sample in sample_urls:
        url_filter.filter_url(sample)