纯PERL 实现LWP抓取网页功能
纯PERL 实现LWP抓取网页功能
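## Note: this function depends on the makeClient function from
# "纯PERL 实现简单TCP客户端" (a simple pure-Perl TCP client). Because of HTTP protocol details it has some known problems; improvements are welcome.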
#=================================================================
# getUrl usage:$info=getUrl($url,$data,$cookie,$method);
#=================================================================
# getUrl - fetch a page over plain HTTP through a raw TCP socket.
#
# Usage:
#   my $info = getUrl($url, $data, $cookie, $method, $timeout);
#
# Parameters:
#   $url     - target URL. A leading "http://" is optional; an explicit
#              ":port" after the host is honoured (default port 80).
#   $data    - optional payload, expected to be URL-encoded already. It is
#              appended to the query string for GET and sent as the request
#              body otherwise.
#   $cookie  - optional cookie string, sent in a "Cookie:" header.
#   $method  - 'GET' (default). Any other value sends $data as a POST body.
#              Without $data the request is always a GET.
#   $timeout - optional; seconds to wait for the socket to become readable
#              before giving up (default 10).
#
# Returns the raw response exactly as received (status line, headers and
# body; the body may still be chunk-framed by the server), or undef when
# the connection or the send failed or nothing was received.
#
# Depends on makeClient($host, $port), defined elsewhere; it is expected to
# return a connected socket handle (a false value is treated as failure).
#
# No prototype is declared on purpose: an empty "()" prototype would make
# Perl reject every call that passes arguments.
sub getUrl {
    my ($url, $data, $cookie, $method, $timeout) = @_;
    return unless defined $url && length $url;

    $method  = 'GET' unless $method;
    $timeout = 10    unless defined $timeout;
    my $cookie_header = $cookie ? "Cookie: $cookie\r\n" : '';

    # Split the URL into the host part and the path after the first slash.
    $url =~ s|^http://||i;
    my ($host, $page) = $url =~ m|(.*?)/(.*)|;
    $host = $url unless $host;
    $page = ''   unless defined $page;

    # Peel an explicit port off the host name.
    my $server_port = 80;
    if ($host =~ s/:(\d*)\z//) {
        $server_port = $1 if length $1;
    }

    # The Host header must name the port actually used when it is not 80.
    my $host_header = $server_port == 80 ? $host : "$host:$server_port";

    my $has_data = defined $data && length $data;
    my $is_post  = $has_data && $method ne 'GET';

    # Build the request target; GET data extends any existing query string.
    my $target = "/$page";
    if ($has_data && !$is_post) {
        if ($page !~ /\?/) {
            $target .= '?';
        }
        elsif ($page !~ /[?&]\z/) {
            $target .= '&';
        }
        $target .= $data;
    }

    my @headers = (
        'Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, '
          . 'application/x-shockwave-flash, application/vnd.ms-powerpoint, '
          . 'application/vnd.ms-excel, application/msword, */*',
        'Accept-Language: zh-cn',
        # The response is returned undecoded, so do not invite gzip/deflate.
        'Accept-Encoding: identity',
        'User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        "Host: $host_header",
    );

    my $body = '';
    if ($is_post) {
        $body = $data;
        push @headers,
          'Content-Type: application/x-www-form-urlencoded',
          'Content-Length: ' . length $body;
    }

    # Ask the server to close the connection after the response so that
    # end-of-file reliably marks the end of the message.
    push @headers, 'Connection: close';

    my $verb    = $is_post ? 'POST' : 'GET';
    my $request = join("\r\n", "$verb $target HTTP/1.1", @headers)
      . "\r\n" . $cookie_header . "\r\n" . $body;

    # Connect to the host.
    my $hostsock = makeClient($host, $server_port);
    return unless $hostsock;
    binmode($hostsock);

    # send() may accept only part of the buffer; loop until all is written.
    my $offset = 0;
    my $total  = length $request;
    while ($offset < $total) {
        my $sent = send($hostsock, substr($request, $offset), 0);
        unless ($sent) {
            close($hostsock);
            return;
        }
        $offset += $sent;
    }

    # Read until the peer closes the connection. select() overwrites its
    # bit-vector argument, so it is handed a fresh copy on every pass.
    my $rin = '';
    vec($rin, fileno($hostsock), 1) = 1;
    my $getAll;
    while (1) {
        my $ready = select(my $rout = $rin, undef, undef, $timeout);
        last unless defined $ready && $ready > 0;    # timeout or error

        my $get = '';
        my $ok = recv($hostsock, $get, 5000, 0);
        last unless defined $ok;                     # receive error
        last unless length $get;                     # orderly end of stream
        $getAll .= $get;
    }

    close($hostsock);
    return $getAll;
}
#=================================================================
###########################
##made by MR.WATER QQ:175 785 113 #
###########################