proxy_scraper/scrap_proxy.pl

#!/usr/bin/env perl
# scrap_proxy.pl
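# Scrape recent Google results for public HTTP proxy lists, fetch each hit,
# and print the unique ip:port pairs found on those pages to stdout.
# Usage: perl scrap_proxy.pl > proxies.txt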
use strict;
use warnings;
use URI::Escape;
use Web::Scraper;
use Furl;

my $ua = Furl->new(agent => "Lynx", timeout => 15);
my $url = 'https://www.google.com/search?q=http+proxy+3128+80&hl=en&newwindow=1&tbo=1&tbs=qdr:d,sbd:1&prmd=imvns&source=lnt&start=0&sa=N';
my $pages = 5; # scrape the first 5 result pages
my $step = 10; # 10 results per page
my $sleep = 5; # sleep 5 seconds between requests

my @result;
for my $n (1..$pages) {
    warn "Checking page $n\n";

    # Move to the n-th result page by rewriting the start= parameter.
    my $start = "start=" . ( ($n - 1) * $step );
    $url =~ s/\bstart=\d+\b/$start/;

    # Collect every <a href> on the result page, keep only Google's
    # /url?q=... redirect links, and pull the target URLs out of them.
    my $crawler = scraper { process "a", "link[]" => '@href'; };
    my $res = $crawler->scrape( $ua->get($url)->content );
    for my $link (@{ $res->{link} }) {
        next unless $link =~ /^\/url\?/;
        push @result, uri_unescape($1) if $link =~ /[?&]q=(http:[^\s?&]+?)(?:$|&)/;
    }

    sleep $sleep;
}
warn "Get ".scalar(@result)." targets\n";

my %proxies;
for my $target (@result) {
    warn " -> processing $target\n";
    my $content = $ua->get($target)->content;

    # Collapse adjacent table cells ("</td><td>") into a ":" so that an IP
    # and a port split across two cells become a single ip:port token.
    $content =~ s/\s*<\/td>\s*<td[^<>]*?>\s*/:/gm;

    # Grab every ip:port (or "ip port") pair with a 3-5 digit port, skip
    # loopback and obvious private ranges, and de-duplicate via hash keys.
    while ( $content =~ /\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s:]+(\d{3,5})\b/g ) {
        my $ip = "$1:$2";
        $proxies{$ip}++ unless $ip =~ /^(?:192\.168|127\.0|10\.)/;
    }
}

print "$_\n" for keys %proxies;