init proxy_scraper

This commit is contained in:
Hui Wang 2014-08-23 19:40:40 +08:00
commit ba5157c21c
7 changed files with 2290 additions and 0 deletions

11
README Normal file
View File

@ -0,0 +1,11 @@
The following modules are required:
Furl
AnyEvent::HTTP
Web::Scraper
You can use cpanm (http://cpanmin.us) to install the above modules easily.
cpanm Furl AnyEvent::HTTP Web::Scraper
Run ./run.sh to generate the list of available proxies; the output file ",proxylist" is written to the "data" directory.

49
check_proxy.pl Executable file
View File

@ -0,0 +1,49 @@
#!/usr/bin/env perl
# hui
# check_proxy.pl
use strict;
use AnyEvent::HTTP;
$| = 1;    # autoflush the currently selected output handle (STDOUT)

# Tunables shared with _check() below.
my $MAX_PROC = 1000;    # upper bound on concurrently active requests
my $timeout  = 10;      # handed to http_request as its timeout

# AnyEvent::HTTP globals: no redirect recursion, and since every request
# goes to the same $target host, the per-host limit is lifted to $MAX_PROC.
$AnyEvent::HTTP::MAX_PER_HOST = $MAX_PROC;
$AnyEvent::HTTP::MAX_RECURSE  = 0;

my $target = "http://www.google.co.jp";
my $cv     = AnyEvent->condvar;
my $start  = time();

# Prime the request pool from STDIN, then block until _check() signals $cv.
_check(*STDIN);
$cv->recv;

# Report the elapsed wall-clock time in whole seconds.
my $elapsed = time() - $start;
print "time: ${elapsed}s\n";
# _check($fh)
#
# Reads proxy candidates ("host:port", one per line) from $fh and issues a GET
# for $target through each one, keeping at most $MAX_PROC requests active.
# Every finished request calls _check again to top the pool back up.
#
# Prints "<proxy> ok!" when the response carries a "server: gws" header and
# "<proxy> failed!" otherwise -- including requests that ended without ever
# delivering response headers (connect errors, timeouts).
# Signals $cv once the input is exhausted and no request is still active.
sub _check {
    my $fh = shift;
    while ($AnyEvent::HTTP::ACTIVE < $MAX_PROC) {
        my $proxy = <$fh>;
        defined $proxy or last;
        $proxy =~ s/\s//g;

        # Blank or malformed lines would hand http_request an unusable proxy
        # spec, so skip anything that is not host:port (optionally :scheme).
        unless ($proxy =~ /\A[^:]+:\d+(?::[^:]+)?\z/) {
            warn "skipping malformed proxy entry '$proxy'\n" if length $proxy;
            next;
        }

        # on_header only runs when response headers actually arrive; this flag
        # lets the completion callback report proxies that failed before that.
        my $reported = 0;

        http_request(
            GET => $target,
            headers => { "user-agent" => "Mozilla/5.0" },
            timeout => $timeout,
            proxy => [split(/:/, $proxy)],
            on_header => sub {
                my $server = $_[0]{'server'};
                $reported = 1;
                if (defined $server && $server eq "gws") {
                    print "$proxy ok!\n";
                } else {
                    print "$proxy failed!\n";
                }
                # The verdict needs only the headers; a false return aborts
                # the request before the body is transferred.
                return 0;
            },
            sub {
                # No headers were ever seen: the proxy did not answer usefully.
                print "$proxy failed!\n" unless $reported;
                _check($fh);
            },
        );
    }
    $cv->send if ($AnyEvent::HTTP::ACTIVE == 0 && eof($fh));
}

1754
data/,proxylist Normal file

File diff suppressed because it is too large Load Diff

19
run.sh Executable file
View File

@ -0,0 +1,19 @@
#!/bin/bash
# run.sh
# Scrapes proxy candidates, checks their connectivity and appends the working
# ones to the proxy list. All paths are relative to the current directory.
PERL=$(command -v perl) || { echo "perl not found in PATH" >&2; exit 1; }
SCRAPER=scrap_proxy.pl
CHECKER=check_proxy.pl
SCRAP_RESULT=tmp/,scrap_result
CHECK_RESULT=tmp/,check_result
PROXYLIST=data/,proxylist

# The redirections below fail if the output directories are missing.
mkdir -p "$(dirname "$SCRAP_RESULT")" "$(dirname "$CHECK_RESULT")" "$(dirname "$PROXYLIST")" || exit 1

echo "Scrapping proxy search result from Google..."
"$PERL" "$SCRAPER" > "$SCRAP_RESULT"
echo "Checking proxy connectivity..."
grep : "$SCRAP_RESULT" | "$PERL" "$CHECKER" | tee "$CHECK_RESULT"
# Keep only lines whose verdict field is exactly "ok!" (not any line that
# merely contains the letters "ok").
awk '$2 == "ok!" {print $1}' "$CHECK_RESULT" >> "$PROXYLIST"
echo -e "All done!\nAvailable proxies are listed in file $PROXYLIST.\n"

52
scrap_proxy.pl Executable file
View File

@ -0,0 +1,52 @@
#!/usr/bin/env perl
# hui
# scrap_proxy.pl
use strict;
use URI::Escape;
use Web::Scraper;
use Furl;
use Data::Dumper;
# Stage 1: walk the first $pages Google result pages for the proxy query and
#          collect the http: result URLs they link to.
# Stage 2: fetch each collected URL and pull out everything that looks like
#          "a.b.c.d:port" (or a.b.c.d followed by a port in the next table
#          cell), dropping addresses that cannot be public proxies.
# Output:  one unique "ip:port" per line on STDOUT; progress goes to STDERR.
my $ua = Furl->new(agent => "Lynx", timeout => "15");
my $url = 'https://www.google.com/search?q=http+proxy+3128+80&hl=en&newwindow=1&tbo=1&tbs=qdr:d,sbd:1&prmd=imvns&source=lnt&start=0&sa=N';
my $pages = 5; # scrap first 5 pages
my $step = 10; # 10 results per page
my $sleep = 5; # sleep 5 seconds for each request

# The link extractor is the same for every page, so build it once.
my $crawler = scraper { process "a", "link[]" => '@href'; };

my @result;
for my $n (1..$pages) {
    warn "Checking page $n\n";

    # Point the start= parameter at the first result of page $n.
    my $offset = ($n - 1) * $step;
    $url =~ s/\bstart=\d+\b/start=$offset/;

    my $page = $ua->get($url);
    if ($page->is_success) {
        my $res = $crawler->scrape( $page->content );
        for my $link (@{ $res->{'link'} || [] }) {
            # Result links look like /url?q=<target>&...; keep the target.
            next unless $link =~ /^\/url\?/;
            push @result, uri_unescape($1) if $link =~ /[?&]q=(http:[^\s?&]+?)(?:$|&)/;
        }
    } else {
        # Do not parse an error body as if it were a result page.
        warn "  page $n could not be fetched: " . $page->status_line . "\n";
    }
    sleep $sleep;
}
warn "Get ".scalar(@result)." targets\n";

my %proxies;
for my $target (@result) {
    warn " -> processing $target\n";
    my $listing = $ua->get($target);
    unless ($listing->is_success) {
        warn "    fetch failed: " . $listing->status_line . "\n";
        next;
    }
    my $content = $listing->content;

    # Join adjacent table cells with ':' so "<td>ip</td><td>port</td>"
    # becomes "ip:port" before matching.
    $content =~ s/\s*<\/td>\s*<td[^<>]*?>\s*/:/gm;

    while( $content =~ /\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})[\s:]+(\d{3,5})\b/g ) {
        my @octets = ($1, $2, $3, $4);
        my $port   = $5;

        # Reject things that merely look like an address (e.g. 999.1.1.1:99999).
        next if grep { $_ > 255 } @octets;
        next if $port > 65535;

        my $ip = join('.', @octets) . ":$port";

        # Skip loopback (127/8) and private ranges (10/8, 172.16/12,
        # 192.168/16): they can never be reachable public proxies.
        next if $ip =~ /^(?:10\.|127\.|192\.168\.|172\.(?:1[6-9]|2\d|3[01])\.)/;

        $proxies{$ip}++;
    }
}
print "$_\n" for keys %proxies;

99
tmp/,check_result Normal file
View File

@ -0,0 +1,99 @@
58.64.168.208:3128 ok!
202.185.100.225:3128 ok!
103.244.2.114:3128 ok!
58.20.127.100:3128 failed!
118.189.1.186:3128 ok!
202.146.144.30:8080 ok!
220.173.59.36:8080 ok!
118.163.243.40:8080 ok!
218.104.148.59:3128 failed!
106.187.38.45:3128 ok!
114.66.219.90:8080 ok!
111.192.57.101:9000 ok!
117.40.160.45:3128 ok!
110.74.197.26:8080 ok!
222.88.240.27:9999 ok!
124.47.57.2:9000 ok!
27.46.21.67:9999 ok!
112.65.19.122:8080 ok!
124.78.81.53:8080 ok!
118.114.77.116:8080 ok!
180.173.85.204:8080 ok!
111.67.74.178:8000 ok!
210.14.147.71:3128 ok!
183.220.195.112:8123 failed!
66.146.193.31:8118 failed!
192.254.70.245:8080 ok!
199.185.61.3:8081 failed!
175.41.246.120:17403 ok!
202.98.123.126:8080 ok!
201.159.23.150:3128 ok!
108.177.171.106:33948 ok!
183.178.23.103:3128 ok!
187.188.195.66:8080 ok!
181.114.59.203:8080 ok!
118.96.172.6:3128 ok!
77.95.194.142:3128 ok!
118.99.114.113:3128 ok!
159.226.61.60:3128 ok!
186.3.52.166:3128 ok!
219.150.205.35:8080 ok!
173.9.143.222:7004 ok!
176.223.101.66:8080 ok!
188.111.69.178:8080 ok!
195.175.201.242:8080 ok!
117.171.242.248:8123 failed!
91.121.158.63:3128 ok!
195.190.117.50:3128 ok!
110.208.27.82:9000 ok!
201.49.209.147:3128 failed!
189.59.219.202:8080 ok!
41.223.53.170:443 ok!
202.52.152.210:8080 ok!
204.84.216.200:3128 ok!
109.70.145.125:3040 ok!
110.208.27.114:9000 ok!
117.164.169.181:8123 failed!
190.128.234.130:3128 ok!
117.121.242.8:18888 ok!
190.79.159.101:8080 ok!
110.208.26.35:9000 ok!
201.208.106.200:8080 ok!
118.174.149.118:8080 ok!
124.81.121.238:8080 ok!
117.169.239.163:8123 failed!
186.226.172.91:8080 ok!
184.107.204.85:3128 ok!
177.38.40.11:3128 ok!
125.124.115.193:8080 ok!
177.69.67.253:3128 ok!
182.253.32.100:8080 ok!
186.228.78.177:3128 ok!
190.42.142.22:8080 ok!
186.228.78.130:3128 ok!
190.39.149.34:8080 ok!
80.193.214.231:3128 ok!
201.211.125.90:8080 ok!
120.84.236.37:8080 ok!
186.228.78.169:3128 ok!
181.48.62.75:8080 ok!
120.84.236.153:8080 ok!
80.193.214.233:3128 ok!
110.208.27.75:9000 ok!
82.207.112.44:3128 ok!
27.44.79.8:8080 ok!
120.84.236.168:8080 ok!
186.93.190.202:8080 ok!
200.84.108.16:8080 ok!
120.84.234.103:8080 ok!
190.7.144.75:8080 ok!
177.19.162.52:3128 ok!
42.121.106.82:8088 failed!
117.171.115.106:8123 failed!
190.204.130.231:8080 ok!
117.163.116.154:8123 failed!
190.204.109.105:8080 ok!
190.36.224.223:8080 ok!
183.218.49.115:8123 failed!
202.116.1.149:8128 ok!
time: 15s

306
tmp/,scrap_result Normal file
View File

@ -0,0 +1,306 @@
183.221.190.202:8123
182.235.175.176:8088
114.66.219.90:8080
190.204.130.231:8080
37.187.97.36:3128
129.74.74.15:3128
110.77.228.131:3128
190.78.166.144:8080
186.101.23.235:3128
124.78.81.53:8080
41.42.241.35:8080
192.254.70.245:8080
193.188.95.146:8080
120.84.236.153:8080
202.70.136.158:3128
198.27.97.214:7808
173.9.143.222:7004
209.141.46.196:8888
218.18.29.223:9000
111.67.74.178:8000
175.41.246.120:17403
115.84.242.84:8080
81.34.161.21:8080
120.84.234.103:8080
186.228.78.169:3128
93.115.46.10:8080
64.78.169.114:3128
166.114.6.34:3128
108.177.171.106:33948
31.193.118.122:8080
223.84.10.112:8123
207.173.172.98:8000
183.89.42.206:3128
200.90.77.247:8080
65.98.100.220:3128
41.32.136.74:808
194.29.178.14:3127
58.137.158.104:8080
202.116.1.149:8128
118.99.114.113:3128
202.97.159.227:8080
117.40.160.45:3128
186.228.78.177:3128
186.91.134.139:8080
124.47.57.2:9000
80.193.214.233:3128
212.248.78.114:8080
77.95.194.142:3128
190.78.10.189:8080
189.208.57.239:1080
27.44.79.8:8080
207.236.90.180:8080
137.135.97.79:8080
220.166.64.73:1080
220.132.152.102:3128
218.204.89.117:8123
113.106.19.28:3128
64.34.14.28:7808
182.253.51.223:8080
200.97.98.171:8080
190.202.220.242:8080
58.20.127.100:3128
181.114.59.203:8080
58.246.43.122:8080
190.203.40.191:8080
177.99.244.38:8080
183.216.167.138:8123
199.241.28.233:8080
80.193.214.231:3128
41.191.237.233:8080
200.84.73.121:8080
137.135.98.170:8080
201.242.88.110:8080
200.93.93.205:8080
180.173.85.204:8080
181.48.62.75:8080
190.77.196.240:8080
203.178.133.10:3124
162.243.50.42:3128
41.220.19.157:3128
202.175.83.183:808
41.223.53.170:443
68.48.33.47:3128
202.146.144.30:8080
180.245.66.251:8080
202.185.100.225:3128
180.183.137.157:3128
112.65.19.122:8080
186.3.52.166:3128
182.253.242.33:3128
213.180.75.122:2023
61.160.126.157:3128
115.156.165.3:8080
186.90.181.166:8080
61.156.235.172:9999
137.117.71.160:8080
41.46.215.190:8080
202.51.117.22:8888
27.46.21.67:9999
204.86.209.115:8080
128.42.142.41:3124
180.183.239.22:3128
179.210.21.241:3128
172.16.0.20:3128
222.223.127.130:808
190.37.163.156:8080
188.111.69.178:8080
120.84.236.168:8080
111.192.57.101:9000
118.163.243.40:8080
190.36.49.69:8080
125.124.115.193:8080
195.190.117.50:3128
201.209.103.253:8080
200.84.60.173:8080
202.91.13.124:8080
95.110.196.114:3128
201.211.125.90:8080
125.162.149.223:8080
117.170.220.29:8123
117.162.152.40:8123
83.146.70.81:3128
201.243.204.70:8080
194.36.10.156:3127
110.74.197.26:8080
117.164.169.181:8123
180.183.154.52:3128
95.138.163.86:8001
203.155.205.4:3128
77.45.132.127:3128
201.49.209.147:3128
216.165.109.79:3127
190.36.224.223:8080
66.146.193.31:8118
190.74.182.156:8080
186.226.172.91:8080
201.159.23.150:3128
82.207.112.44:3128
183.178.23.103:3128
195.175.201.242:8080
208.113.228.217:5555
103.244.2.114:3128
175.103.42.218:3128
220.164.108.3:1080
177.38.40.11:3128
189.59.219.202:8080
65.98.100.210:3128
108.178.200.46:8080
182.253.73.142:8080
177.19.162.52:3128
202.98.123.126:8080
117.171.242.248:8123
190.7.144.75:8080
219.150.205.35:8080
62.111.208.195:123
190.39.149.34:8080
120.84.236.37:8080
176.205.213.147:8118
223.84.16.39:8123
117.163.116.154:8123
109.233.215.166:8080
177.69.67.253:3128
176.223.101.66:8080
67.17.38.72:3128
177.69.195.4:3128
220.173.59.36:8080
118.114.77.116:8080
195.113.161.83:3124
201.38.204.210:8080
216.12.29.195:8080
190.37.34.192:8080
204.84.216.200:3128
118.99.84.141:8080
183.216.249.12:8123
190.128.234.130:3128
117.171.115.106:8123
109.194.65.175:3128
222.88.240.27:9999
106.187.38.45:3128
110.208.27.82:9000
202.52.152.210:8080
118.174.149.118:8080
180.183.66.253:8080
110.208.27.114:9000
184.107.204.85:3128
183.218.49.115:8123
194.45.222.17:255
87.236.208.153:3128
200.84.108.16:8080
118.189.1.186:3128
218.58.136.14:808
117.162.205.125:8123
105.236.66.187:3128
219.93.174.104:553
220.110.137.44:8080
190.42.142.22:8080
187.188.195.66:8080
159.226.61.60:3128
186.93.190.202:8080
190.142.106.156:8080
133.11.240.57:3127
61.178.178.159:9999
186.228.78.130:3128
117.171.57.130:8123
182.253.32.100:8080
190.204.109.105:8080
128.42.142.43:3124
195.68.114.9:8080
117.177.195.38:8123
183.221.160.48:8123
137.99.11.87:3124
42.121.106.82:8088
172.16.0.10:3128
37.59.81.65:443
173.208.110.98:34061
201.242.90.177:8080
91.121.158.63:3128
122.49.12.186:1080
140.247.60.126:3127
58.64.168.208:3128
31.170.178.2:8080
180.211.159.138:8080
129.82.12.188:3124
124.81.121.238:8080
115.29.161.178:8000
109.70.145.125:3040
110.137.40.52:8080
117.170.206.111:8123
183.220.195.112:8123
210.14.147.71:3128
110.208.27.75:9000
110.208.26.35:9000
183.219.94.247:8123
201.208.106.200:8080
87.194.10.38:443
5.178.96.125:3128
118.96.172.6:3128
190.73.156.135:8080
199.185.61.3:8081
190.235.148.246:3128
1.2.3.4:5678
219.61.100.24:3128
117.121.242.8:18888
187.95.112.243:3128
190.198.162.171:8080
200.82.248.86:8080
190.75.142.153:8080
190.79.159.101:8080
137.135.99.5:8080
5.135.42.105:3128
218.104.148.59:3128
117.169.239.163:8123
122.129.118.186:3128
61.153.236.30:8080
218.249.83.87:8080
186.5.102.162:8080
69.147.64.31:209
190.39.38.57:3128
118.142.19.39:1180
200.109.33.50:8080
203.151.44.66:8080
88.248.183.22:8080
190.120.251.154:8080
182.253.35.57:8080
223.86.18.109:8123
190.40.54.245:8080
91.121.136.186:9999
187.59.2.83:3128
137.135.97.7:8080
110.208.27.178:9000
186.215.80.218:3128
174.129.196.16:8080
109.73.70.165:5005
85.64.202.69:29991
219.94.87.123:8080
219.72.230.2:1080
129.82.12.188:3128
117.170.197.19:8123
109.73.70.165:7080
175.140.44.162:8080
146.57.249.98:3124
190.253.89.124:8080
171.100.122.119:3128
186.228.78.133:3128
212.56.195.190:8080
190.36.136.204:8080
202.29.214.2:3129
192.254.128.172:8080
183.217.162.173:8123
61.164.184.66:8090
112.45.120.143:8123
117.164.164.143:8123
219.137.229.146:9999
82.200.164.226:3128
190.77.2.71:8080
218.18.128.108:9000
109.196.127.194:8080
120.206.144.186:8123
120.72.84.192:8080
211.41.55.136:123
180.246.216.194:3128
117.169.231.61:8123
222.180.173.3:8080
192.80.153.126:8080
117.170.223.129:8123
77.78.116.86:3128