86 lines
3.6 KiB
Python
86 lines
3.6 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
__author__ = 'Langziyanqin'
|
||
__QQ__ = '982722261'
|
||
┏┓ ┏┓
|
||
┏┛┻━━━┛┻┓
|
||
┃ ☃ ┃
|
||
┃ ┳┛ ┗┳ ┃
|
||
┃ ┻ ┃
|
||
┗━┓ ┏━┛
|
||
┃ ┗━━━┓
|
||
┃ 神兽保佑 ┣┓
|
||
┃ 永无BUG! ┏┛
|
||
┗┓┓┏━┳┓┏┛
|
||
┃┫┫ ┃┫┫
|
||
┗┻┛ ┗┻┛
|
||
"""
|
||
import sys
|
||
import os
|
||
import requests
|
||
import re
|
||
import time
|
||
import threadpool
|
||
import random
|
||
# Python 2 only: make UTF-8 the interpreter-wide default codec so implicit
# str <-> unicode conversions of the scraped (Chinese) text do not raise.
# site.py removes sys.setdefaultencoding at start-up; reload(sys) restores it.
# On Python 3 neither name exists and the default is already UTF-8, so the
# whole step is skipped instead of crashing with NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')
|
||
# Print the multi-line ASCII-art start-up banner.
# NOTE(review): the art's spacing looks collapsed and the lone `|` / `||` rows
# look like extraction residue rather than part of the banner - confirm against
# the upstream file before changing this literal.
print '''
|
||
|
||
| __ __ __
|
||
|_, (__( | ) (__|
|
||
__/
|
||
|
||
'''
|
||
# Pause for one second after the banner, before any page is requested.
time.sleep(1)
|
||
# Base address of the board listing; main() appends the numeric page offset.
url = 'http://maoyan.com/board/4?offset='

# Pool of browser User-Agent strings. get_html() picks one at random for
# every request it sends.
headerss = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
|
||
|
||
|
||
|
||
def get_html(urlx, timeout=10):
    """Download one board page and hand its body to ``get_parser``.

    A User-Agent is chosen at random from ``headerss`` for each request.
    On success the raw response body and a separator line are printed, then
    the body is passed to ``get_parser``.

    Args:
        urlx: Full URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawl indefinitely.

    Returns:
        None. If the request fails, the error is printed and the page is
        skipped.
    """
    request_headers = {'User-Agent': random.choice(headerss)}
    try:
        response = requests.get(url=urlx, headers=request_headers, timeout=timeout)
    except requests.RequestException as e:
        # No response object exists on failure. The previous version read it
        # from a ``finally`` clause and died with UnboundLocalError here,
        # hiding the real network error.
        print(e)
        return

    body = response.content
    print(body)
    print('---------------------------------------')
    get_parser(body)
|
||
|
||
def get_parser(requ):
    """Extract the board entries from a page and print one line per entry.

    Each ``<dd>`` entry is matched for its rank, title, cast line, release
    line and the two halves of the score (integer part and fraction digit).
    The captured text is used exactly as it appears in the markup; it is not
    stripped.

    Args:
        requ: Page markup, either as text or as UTF-8 encoded bytes.

    Returns:
        A list of ``(rank, title, stars, release, score)`` tuples of strings,
        in page order; ``score`` is the integer and fraction parts joined.
        The list is empty when nothing matches.
    """
    # On Python 3 a response body is ``bytes`` and cannot be searched with a
    # text pattern. On Python 2 ``bytes`` is ``str``, so this branch is skipped
    # and the input is used unchanged.
    if isinstance(requ, bytes) and not isinstance(requ, str):
        requ = requ.decode('utf-8', 'replace')

    pattern = re.compile(r'<dd>.*?board-index.*?">(\d+)</i>.*?title="(.*?)".*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(\d+)</i>', re.S)

    rows = []
    for rank, title, stars, release, integer, fraction in pattern.findall(requ):
        row = (rank, title, stars, release, integer + fraction)
        # Space-joined single string: prints the same on Python 2 and 3.
        print(' '.join(row))
        rows.append(row)
    return rows
|
||
def main():
    """Crawl the ten board pages in order (offsets 0, 10, ..., 90)."""
    for page in range(10):
        page_url = url + str(page * 10)
        print('Crawl:' + page_url)
        get_html(page_url)
|
||
|
||
|
||
# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
|