Improving crawl

git-svn-id: file:///home/svn/framework3/trunk@8861 4d416f70-5f16-0410-b530-b9f4589650da
This commit is contained in:
et 2010-03-21 00:12:28 +00:00
parent 0b996801d0
commit eb61f72431
1 changed file with 50 additions and 5 deletions

View File

@ -56,6 +56,9 @@ $proxyhost = '127.0.0.1'
# Proxy Port
$proxyport = 8080
# Cookie Jar
$cookiejar = {}
class HttpCrawler
attr_accessor :ctarget, :cport, :cinipath, :cssl, :proxyhost, :proxyport, :useproxy
@ -141,8 +144,10 @@ class HttpCrawler
begin
loop do
hashreq = @NotViewedQueue.take(reqtemplate(self.ctarget,self.cport,self.cssl), $taketimeout)
#puts hashreq
reqfilter = reqtemplate(self.ctarget,self.cport,self.cssl)
hashreq = @NotViewedQueue.take(reqfilter, $taketimeout)
if !@ViewedQueue.include?(hashsig(hashreq))
@ViewedQueue[hashsig(hashreq)] = Time.now
@ -166,6 +171,7 @@ class HttpCrawler
sendreq(c,hashreq)
# })
# i += 1
@ -215,6 +221,7 @@ class HttpCrawler
def sendreq(nclient,reqopts={})
#puts reqopts
puts ">> #{reqopts['uri']}"
if reqopts['query']
@ -234,6 +241,11 @@ class HttpCrawler
# In case modules or crawler calls to_s on de-chunked responses
#
resp.transfer_chunked = false
if resp['Set-Cookie']
#puts "SET COOKIE: #{resp['Set-Cookie']}"
#puts "Storing in cookie jar for host:port #{reqopts['rhost']}:#{reqopts['rport']}"
$cookiejar["#{reqopts['rhost']}:#{reqopts['rport']}"] = resp['Set-Cookie']
end
#puts ("#{resp.to_s}")
#puts "resp code #{resp.code}"
@ -248,8 +260,9 @@ class HttpCrawler
@crawlermodules.each_key do |k|
@crawlermodules[k].parse(reqopts,resp)
end
when 301
puts "Redirection"
when 302
puts "(#{resp.code}) Redirection to: #{resp['Location']}"
insertnewpath(urltohash(resp['Location']))
when 404
puts "Invalid link (404) #{reqopts['uri']}"
else
@ -273,7 +286,7 @@ class HttpCrawler
if @NotViewedQueue.read_all(hashreq).size > 0
#puts "Already in queue to be viewed"
else
#puts "I: #{hashreq['uri']}"
#puts "Inserted: #{hashreq['uri']}"
@NotViewedQueue.write(hashreq)
end
else
@ -282,6 +295,38 @@ class HttpCrawler
end
end
#
# Build a new hash for a local path
#
def urltohash(url)
uri = URI.parse(url)
tssl = (uri.scheme == "https") ? true : false
if (uri.host.nil? or uri.host.empty?)
uritargethost = self.ctarget
uritargetport = self.cport
uritargetssl = self.cssl
else
uritargethost = uri.host
uritargetport = uri.port
uritargetssl = tssl
end
hashreq = {
'rhost' => uritargethost,
'rport' => uritargetport,
'uri' => uri.path,
'method' => 'GET',
'ctype' => 'text/plain',
'ssl' => uritargetssl,
'query' => uri.query
}
#puts hashreq
return hashreq
end
# Signature used to track a request in the viewed/not-viewed queues:
# simply the string rendering of the whole request hash.
def hashsig(hashreq)
  String(hashreq)
end