#!/usr/bin/env perl
# scrap_proxy.pl - harvest HTTP proxy lists from recent Google results
use strict;
use warnings;
use URI::Escape;
use Web::Scraper;
use Furl;

my $ua = Furl->new(agent => "Lynx", timeout => 15);

# Search for pages mentioning common proxy ports, restricted to the last day
my $url = 'https://www.google.com/search?q=http+proxy+3128+80&hl=en&newwindow=1&tbo=1&tbs=qdr:d,sbd:1&prmd=imvns&source=lnt&start=0&sa=N';

my $pages = 5;   # scrape the first 5 result pages
my $step  = 10;  # 10 results per page
my $sleep = 5;   # sleep 5 seconds between requests

# Collect every <a href> on a results page
my $crawler = scraper {
    process "a", "link[]" => '@href';
};

my @result;
for my $n (1..$pages) {
    warn "Checking page $n\n";

    # Rewrite the start= offset in the URL for the current page
    my $start = "start=" . ($n - 1) * $step;
    $url =~ s/\bstart=\d+\b/$start/;

    my $res = $crawler->scrape( $ua->get($url)->content );

    # Google wraps result links as /url?q=<target>&...; unwrap them
    for my $link (@{ $res->{link} }) {
        next unless $link =~ /^\/url\?/;
        push @result, uri_unescape($1)
            if $link =~ /[?&]q=(http:[^\s?&]+?)(?:$|&)/;
    }
    sleep $sleep;
}

warn "Got " . scalar(@result) . " targets\n";

my %proxies;
for my $target (@result) {
    warn " -> processing $target\n";
    my $content = $ua->get($target)->content;

    # Many proxy lists put IP and port in adjacent table cells;
    # collapse the </td><td> between them into a colon
    $content =~ s/\s*<\/td>\s*<td[^>]*?>\s*/:/g;

    # Pull out ip:port pairs (2-5 digit ports, so 80 matches too),
    # skipping private address ranges
    while ( $content =~ /\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s:]+(\d{2,5})\b/g ) {
        my $proxy = "$1:$2";
        $proxies{$proxy}++ unless $proxy =~ /^(?:192\.168\.|127\.|10\.)/;
    }
}
print "$_\n" for keys %proxies;
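
# --- Optional liveness check: a minimal sketch, not part of the original
# harvester. It assumes Furl's documented `proxy` constructor option and
# uses http://example.com/ as an arbitrary probe URL (both are assumptions
# made here, not choices from the script above).
for my $candidate (keys %proxies) {
    my $checker = Furl->new(
        agent   => "Lynx",
        timeout => 10,
        proxy   => "http://$candidate",  # route the probe through the candidate
    );
    # Guard the request with eval in case the client dies on low-level failures
    my $res = eval { $checker->get('http://example.com/') };
    warn "$candidate looks alive\n" if $res && $res->is_success;
}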