接上面的http_get函数
print S $ENV{'REQUEST_METHOD'}, ' ', $request_uri, " HTTP/$HTTP_VERSION\015\012",
'Host: ', $host, $portst, "\015\012", # needed for multi-homed servers
'Accept: ', $env_accept, "\015\012", # possibly modified
'User-Agent: ', $USER_AGENT || $ENV{'HTTP_USER_AGENT'}, "\015\012",
$proxy_auth_header ; # empty if not needed
# Create Referer: header if so configured.
# Only include Referer: if we successfully remove $script_url+flags from
# start of referring URL. Note that flags may not always be there.
# If using @PROXY_GROUP, loop through them until one fits. This could
# only be ambiguous if one proxy in @PROXY_GROUP is called through
# another proxy in @PROXY_GROUP, which you really shouldn't do anyway.
if (!$e_hide_referer) {
my($referer)= $ENV{'HTTP_REFERER'} ;
if (@PROXY_GROUP) {
foreach (@PROXY_GROUP) {
print(S 'Referer: ', &proxy_decode($referer), "\015\012"), last
if $referer=~ s#^$_(/[^/]*/?)?## && ($referer ne '') ;
last if $referer eq '' ;
}
} else {
print S 'Referer: ', &proxy_decode($referer), "\015\012"
if $referer=~ s#^$THIS_SCRIPT_URL(/[^/]*/?)?## && ($referer ne '') ;
}
}
# Add "Connection: close" header if we're using HTTP 1.1.
print S "Connection: close\015\012" if $HTTP_VERSION eq '1.1' ;
# Add the cookie if it exists and cookies aren't banned here.
print S 'Cookie: ', $cookie_to_server, "\015\012"
if !$cookies_are_banned_here && ($cookie_to_server ne '') ;
# Add Pragma: and Cache-Control: headers if they were given in the
# request, to allow caches to behave properly. These two headers
# need no modification.
# As explained above, we can't rely on request headers being provided
# to the script via environment variables.
print S "Pragma: $ENV{HTTP_PRAGMA}\015\012" if $ENV{HTTP_PRAGMA} ne '' ;
print S "Cache-Control: $ENV{HTTP_CACHE_CONTROL}\015\012"
if $ENV{HTTP_CACHE_CONTROL} ne '' ;
# Add Authorization: header if we've had a challenge.
if ($realm ne '') {
# If we get here, we know $realm has a defined $auth and has not
# been tried.
print S 'Authorization: Basic ', $auth{$realm}, "\015\012" ;
$tried_realm= $realm ;
} else {
# If we have auth information for this server, what the hey, let's
# try one, it may save us a request/response cycle.
# First case is for rare case when auth info is in URL. Related
# block 100 lines down needs no changes.
if ($username ne '') {
print S 'Authorization: Basic ',
&base64($username . ':' . $password),
"\015\012" ;
} elsif ( ($tried_realm,$auth)= each %auth ) {
print S 'Authorization: Basic ', $auth, "\015\012" ;
}
}
# A little problem with authorization and POST requests: If auth
# is required, we won't know which realm until after we make the
# request and get part of the response. But to make the request,
# we have to send the entire POST body, because some servers
# mistakenly require that before returning even an error response.
# So this means we have to send the entire POST body, and be
# prepared to send it a second time, thus we have to store it
# locally. Either that, or fail to send the POST body a second
# time. Here, we let the owner of this proxy set $MAX_REQUEST_SIZE:
# store and post a second time if a request is smaller, or else
# die with 413 the second time through.
# If request method is POST, copy content headers and body to request.
# The first time through here, save body to @postbody, if the body's
# not too big.
if ($ENV{'REQUEST_METHOD'} eq 'POST') {
if ($body_too_big) {
# Quick 'n' dirty response for an unlikely occurrence.
# 413 is not actually an HTTP/1.0 response...
&HTMLdie("Sorry, this proxy can't handle a request larger "
. "than $MAX_REQUEST_SIZE bytes at a password-protected"
. " URL. Try reducing your submission size, or submit "
. "it to an unprotected URL.", 'Submission too large',
'413 Request Entity Too Large') ;
}
# Otherwise...
$lefttoget= $ENV{'CONTENT_LENGTH'} ;
print S 'Content-type: ', $ENV{'CONTENT_TYPE'}, "\015\012",
'Content-length: ', $lefttoget, "\015\012\015\012" ;
if (@postbody) {
print S @postbody ;
} else {
$body_too_big= ($lefttoget > $MAX_REQUEST_SIZE) ;
# Loop to guarantee all is read from STDIN.
do {
$lefttoget-= read(STDIN, $postblock, $lefttoget) ;
print S $postblock ;
# efficient-- only doing test when input is slow anyway.
push(@postbody, $postblock) unless $body_too_big ;
} while $lefttoget && ($postblock ne '') ;
}
# For GET or HEAD requests, just add extra blank line.
} else {
print S "\015\012" ;
}
# Wait a minute for the response to start
vec($rin= '', fileno(S), 1)= 1 ;
select($rin, undef, undef, 60)
|| &HTMLdie("No response from $realhost:$realport") ;
#------ Read full response into $status, $headers, and $body ----
# Support both HTTP 1.x and HTTP 0.9
$status= <S> ; # first line, which is the status line in HTTP 1.x
# HTTP 0.9
# Ignore possibility of HEAD, since it's not defined in HTTP 0.9.
# Do any HTTP 0.9 servers really exist anymore?
unless ($status=~ m#^HTTP/#) {
$is_html= 1 ; # HTTP 0.9 by definition implies an HTML response
$content_type= 'text/html' ;
undef $/ ;
$body= $status . <S> ;
$status= '' ;
close(S) ;
untie(*S) if $scheme eq 'https' ;
return ;
}
# After here, we know we're using HTTP 1.x
# Be sure to handle case when server doesn't send blank line! It's
# rare and erroneous, but a couple servers out there do that when
# responding with a redirection. This can cause some processes to
# linger and soak up resources, particularly under mod_perl.
# To handle this, merely check for eof(S) in until clause below.
# ... except that for some reason invoking eof() on a tied SSL_Handle
# makes later read()'s fail with unlikely error messages. :(
# So instead of eof(S), test "$_ eq ''".
# Loop to get $status and $headers until we get a non-100 response.
do {
($status_code)= $status=~ m#^HTTP/\d+\.\d+\s+(\d+)# ;
$headers= '' ; # could have been set by first attempt
do {
$headers.= $_= <S> ; # $headers includes last blank line
# } until (/^(\015\012|\012)$/) || eof(S) ; # lines end w/ LF or CRLF
} until (/^(\015\012|\012)$/) || $_ eq '' ; #lines end w/ LF or CRLF
$status= <S> if $status_code == 100 ; # re-read for next iteration
} until $status_code != 100 ;
# Unfold long header lines, a la RFC 822 section 3.1.1
$headers=~ s/(\015\012|\012)[ \t]+/ /g ;
# Check for 401 Unauthorized response
if ($status=~ m#^HTTP/\d+\.\d+\s+401\b#) {
($realm)=
$headers=~ /^WWW-Authenticate:\s*Basic\s+realm="([^"\n]*)/mi ;
&HTMLdie("Error by target server: no WWW-Authenticate header.")
unless $realm ne '' ;
if ($auth{$realm} eq '') {
&get_auth_from_user($host, $realm, $URL) ;
} elsif ($realm eq $tried_realm) {
&get_auth_from_user($host, $realm, $URL, 1) ;
}
# so now $realm exists, has defined $auth, and has not been tried
close(S) ;
untie(*S) if $scheme eq 'https' ;
redo HTTP_GET ;
}
# Extract $content_type, used in several places
($content_type)= $headers=~ m#^Content-Type:\s*([\w/.+\$-]*)#mi ;
$content_type= lc($content_type) ;
# If we're text only, then cut off non-text responses (but allow
# unspecified types).
if ($TEXT_ONLY) {
if ( ($content_type ne '') && ($content_type!~ m#^text/#i) ) {
&non_text_die ;
}
}
# If we're removing scripts, then disallow script MIME types.
if ($scripts_are_banned_here) {
&script_content_die if $content_type=~ /^$SCRIPT_TYPE_REGEX$/io ;
# Note that the non-standard Link: header, which may link to a
# style sheet, is handled in http_fix().
}
# If URL matches one of @BANNED_IMAGE_URL_PATTERNS, then skip the
# resource unless it's clearly a text type.
if ($images_are_banned_here) {
&skip_image unless $content_type=~ m#^text/#i ;
}
# Keeping $base_url, $base_host, and $base_path up-to-date is an
# ongoing job. Here, we look in appropriate headers. Note that if
# Content-Base: doesn't exist, Content-Location: is an absolute URL.
if ($headers=~ m#^Content-Base:\s*([\w+.-]+://\S+)#mi) {
$base_url= $1, &fix_base_vars ;
} elsif ($headers=~ m#^Content-Location:\s*([\w+.-]+://\S+)#mi) {
$base_url= $1, &fix_base_vars ;
} elsif ($headers=~ m#^Location:\s*([\w+.-]+://\S+)#mi) {
$base_url= $1, &fix_base_vars ;
}
# Now, fix the headers with &http_fix(). It uses &full_url(), and
# may modify the headers we just extracted the base URL from.
# This also includes cookie support.
&http_fix ;
# If configured, make this response as non-cacheable as possible.
# This means remove any Expires: and Pragma: headers (the latter
# could be using extensions), strip Cache-Control: headers of any
# unwanted directives and add the "no-cache" directive, and add back
# to $headers the new Cache-Control: header and a "Pragma: no-cache"
# header.
# A lot of this is documented in the HTTP 1.1 spec, sections 13 as a
# whole, 13.1.3, 13.4, 14.9, 14.21, and 14.32. The Cache-Control:
# response header has eight possible directives, plus extensions;
# according to section 13.4, all except "no-cache", "no-store", and
# "no-transform" might indicate cacheability, so remove them. Remove
# extensions for the same reason. Remove any parameter from
# "no-cache", because that would limit its effect. This effectively
# means preserve only "no-store" and "no-transform" if they exist
# (neither have parameters), and add "no-cache".
# We use a quick method here that works for all but cases both faulty
# and obscure, but opens no privacy holes; in the future we may fully
# parse the header value(s) into its comma-separated list of
# directives.
if ($MINIMIZE_CACHING) {
my($new_value)= 'no-cache' ;
$new_value.= ', no-store'
if $headers=~ /^Cache-Control:.*?\bno-store\b/mi ;
$new_value.= ', no-transform'
if $headers=~ /^Cache-Control:.*?\bno-transform\b/mi ;
my($no_cache_headers)=
"Cache-Control: $new_value\015\012Pragma: no-cache\015\012" ;
$headers=~ s/^Cache-Control:[^\012]*\012?//mig ;
$headers=~ s/^Pragma:[^\012]*\012?//mig ;
$headers=~ s/^Expires:[^\012]*\012?//mig ;
$headers= $no_cache_headers . $headers ;
}
# Set $is_html if headers indicate HTML response.
# Question: are there any other HTML-like MIME types, including x-... ?
$is_html= 1 if $content_type eq 'text/html' ;
# Some servers return HTML content without the Content-Type: header.
# These MUST be caught, because Netscape displays them as HTML, and
# a user could lose their anonymity on these pages.
# According to the HTTP 1.1 spec, section. 7.2.1, browsers can choose
# how to deal with HTTP bodies with no Content-Type: header. See
# http://www.ietf.org/rfc/rfc2616.txt
# In such a case, Netscape seems to always assume "text/html".
# Konqueror seems to guess the MIME type by using the Unix "file"
# utility on the first 1024 bytes, and possibly other clues (e.g.
# resource starts with "<h1>").
# In any case, we must interpret as HTML anything that *may* be
# interpreted as HTML by the browser. So if there is no
# Content-Type: header, set $is_html=1 . The worst that would
# happen would be the occasional content mangled by modified URLs,
# which is better than a privacy hole.
$is_html= 1 if ($content_type eq '') ;
# To support non-NPH hack, replace first part of $status with
# "Status:" if needed.
$status=~ s#^\S+#Status:# if $NOT_RUNNING_AS_NPH ;
# To support streaming media and large files, read the data from
# the server and send it immediately to the client. The exception
# is HTML content, which still must be read fully to be converted
# in the main block. HTML content is not normally streaming or
# very large.
# This requires $status and $headers to be returned now, which is
# OK since headers have been completely cleaned up by now. This
# also means that changes after this point to $body won't
# have any effect, which in fact is fine in the case of non-HTML
# resources. Set $response_sent to prevent the main block from
# sending a response.
# Also, handle any non-HTML types here which must be proxified.
# This is a bit sloppy now, just a quick hack to get rudimentary
# handling of multiple types working and released. It will be
# rewritten more cleanly at some point, when the whole proxifying
# of different types is modularized better.
# Only read body if the request method is not HEAD
if ($ENV{'REQUEST_METHOD'} ne 'HEAD') {
# Because of the erroneous way some browsers use the expected
# MIME type instead of the actual Content-Type: header, check
# $expected_type first.
# Since style sheets tend to be automatically loaded, whereas other
# types (like scripts) are more user-selected, plus the fact that
# CSS can be safely proxified and scripts cannot, we treat a
# resource as CSS if it *may* be treated as CSS by the browser.
# This is relevant when $expected_type and Content-Type: differ.
if ( ($expected_type=~ /^$TYPES_TO_HANDLE_REGEX$/io)
|| ($content_type=~ /^$TYPES_TO_HANDLE_REGEX$/io) ) {
my($type) ;
if ( ($expected_type eq 'text/css') || ($content_type eq 'text/css') ) {
$type= 'text/css' ;
} elsif ($expected_type=~ /^$TYPES_TO_HANDLE_REGEX$/io) {
$type= $expected_type ;
} else {
$type= $content_type ;
}
# If response is chunked, then dechunk it before processing.
# Not perfect (it loses the benefit of chunked encoding), but it
# works and will seldom be a problem. Chunked encoding won't
# often be used for the MIME types we're proxifying anyway.
# Append $footers into $headers, and remove any Transfer-Encoding: header.
if ($headers=~ /^Transfer-Encoding:[ \t]*chunked\b/mi) {
($body, $footers)= &get_chunked_body('S') ;
&HTMLdie(&HTMLescape("Error reading chunked response from $URL ."))
unless defined($body) ;
$headers=~ s/^Transfer-Encoding:[^\012]*\012?//mig ;
$headers=~ s/^(\015\012|\012)/$footers$1/m ;
# If not chunked, read entire input into $body.
} else {
undef $/ ;
$body= <S> ;
}
$body= &proxify_block($body, $type) ;
$headers=~ s/^Content-Length:.*/
'Content-Length: ' . length($body) /mie ;
print $status, $headers, $body ;
$response_sent= 1 ;
} elsif ($is_html) {
# If response is chunked, handle as above; see comments there.
if ($headers=~ /^Transfer-Encoding:[ \t]*chunked\b/mi) {
($body, $footers)= &get_chunked_body('S') ;
&HTMLdie(&HTMLescape("Error reading chunked response from $URL ."))
unless defined($body) ;
$headers=~ s/^Transfer-Encoding:[^\012]*\012?//mig ;
$headers=~ s/^(\015\012|\012)/$footers$1/m ;
# If not chunked, read entire input into $body.
} else {
undef $/ ;
$body= <S> ;
}
# This is for when the resource is passed straight through without
# modification.
# We don't care whether it's chunked or not here.
} else {
my($buf) ;
print $status, $headers ;
# If using SSL, read() could return 0 and truncate data. :P
print $buf while read(S, $buf, 16384) ;
$response_sent= 1 ;
}
} else {
$body= '' ;
}
close(S) ;
untie(*S) if $scheme eq 'https' ;
} # HTTP_GET:
} # sub http_get()